X-Git-Url: https://git.sur5r.net/?a=blobdiff_plain;f=bacula%2Fsrc%2Fdird%2Fjob.c;h=6c71c7096fc5c416c2f50acb9c602edb2d25a818;hb=4137d8ab47939bf6d83eb5153ec854c5da0f776d;hp=ec28e6e2378f9d6cc4f26a5c00c267686ee2452d;hpb=54e7991c0c57a3d379685f60c61660a0110ca0be;p=bacula%2Fbacula diff --git a/bacula/src/dird/job.c b/bacula/src/dird/job.c index ec28e6e237..6c71c7096f 100644 --- a/bacula/src/dird/job.c +++ b/bacula/src/dird/job.c @@ -7,7 +7,7 @@ * Version $Id$ */ /* - Copyright (C) 2000-2003 Kern Sibbald and John Walker + Copyright (C) 2000-2004 Kern Sibbald and John Walker This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as @@ -31,16 +31,12 @@ /* Forward referenced subroutines */ static void *job_thread(void *arg); -static char *edit_run_codes(JCR *jcr, char *omsg, char *imsg); -static void release_resource_locks(JCR *jcr); -static int acquire_resource_locks(JCR *jcr); -#ifdef USE_SEMAPHORE -static void backoff_resource_locks(JCR *jcr, int count); -#endif +static void job_monitor_watchdog(watchdog_t *self); +static void job_monitor_destructor(watchdog_t *self); +static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr); +static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr); /* Exported subroutines */ -void run_job(JCR *jcr); - /* Imported subroutines */ extern void term_scheduler(); @@ -50,71 +46,67 @@ extern int do_admin(JCR *jcr); extern int do_restore(JCR *jcr); extern int do_verify(JCR *jcr); -#ifdef USE_SEMAPHORE -static semlock_t job_lock; -static pthread_mutex_t mutex; -static pthread_cond_t resource_wait; -static int waiting = 0; /* count of waiting threads */ -#else -/* Queue of jobs to be run */ -workq_t job_wq; /* our job work queue */ -#endif +/* Imported variables */ +extern time_t watchdog_time; + +jobq_t job_queue; void init_job_server(int max_workers) { int stat; -#ifdef USE_SEMAPHORE - if ((stat = sem_init(&job_lock, max_workers)) != 0) { - Emsg1(M_ABORT, 0, _("Could not init job lock: ERR=%s\n"), strerror(stat)); - } - if ((stat = pthread_mutex_init(&mutex, NULL)) != 0) { - Emsg1(M_ABORT, 0, _("Could not init resource mutex: ERR=%s\n"), strerror(stat)); - } - if ((stat = pthread_cond_init(&resource_wait, NULL)) != 0) { - Emsg1(M_ABORT, 0, _("Could not init resource wait: ERR=%s\n"), strerror(stat)); - } + watchdog_t *wd; + + if ((stat = jobq_init(&job_queue, max_workers, job_thread)) != 0) { + berrno be; + be.set_errno(stat); + Emsg1(M_ABORT, 0, _("Could not init job queue: ERR=%s\n"), be.strerror()); + } + if ((wd = new_watchdog()) == NULL) { + Emsg0(M_ABORT, 0, _("Could not init job monitor watchdogs\n")); + } + wd->callback = job_monitor_watchdog; + wd->destructor = job_monitor_destructor; + wd->one_shot = false; + wd->interval = 60; + wd->data = new_control_jcr("*JobMonitor*", JT_SYSTEM); + register_watchdog(wd); +} -#else - if ((stat = workq_init(&job_wq, max_workers, job_thread)) != 0) { - Emsg1(M_ABORT, 0, _("Could not init job work queue: ERR=%s\n"), strerror(stat)); +void term_job_server() +{ + int stat; + if ((stat=jobq_destroy(&job_queue)) != 0) { + berrno be; + be.set_errno(stat); + Emsg1(M_INFO, 0, _("Could not term job queue: ERR=%s\n"), be.strerror()); } -#endif - return; } /* * Run a job -- typically called by the scheduler, but may also * be called by the UA (Console program). * + * Returns: 0 on failure + * JobId on success + * */ -void run_job(JCR *jcr) +JobId_t run_job(JCR *jcr) { int stat, errstat; -#ifdef USE_SEMAPHORE - pthread_t tid; -#else - workq_ele_t *work_item; -#endif + JobId_t JobId = 0; - sm_check(__FILE__, __LINE__, True); + P(jcr->mutex); + sm_check(__FILE__, __LINE__, true); init_msg(jcr, jcr->messages); - create_unique_job_name(jcr, jcr->job->hdr.name); - set_jcr_job_status(jcr, JS_Created); - jcr->jr.SchedTime = jcr->sched_time; - jcr->jr.StartTime = jcr->start_time; - jcr->jr.Type = jcr->JobType; - jcr->jr.Level = jcr->JobLevel; - jcr->jr.JobStatus = jcr->JobStatus; - bstrncpy(jcr->jr.Name, jcr->job->hdr.name, sizeof(jcr->jr.Name)); - bstrncpy(jcr->jr.Job, jcr->Job, sizeof(jcr->jr.Job)); /* Initialize termination condition variable */ if ((errstat = pthread_cond_init(&jcr->term_wait, NULL)) != 0) { - Jmsg1(jcr, M_FATAL, 0, _("Unable to init job cond variable: ERR=%s\n"), strerror(errstat)); - set_jcr_job_status(jcr, JS_ErrorTerminated); - free_jcr(jcr); - return; + berrno be; + be.set_errno(errstat); + Jmsg1(jcr, M_FATAL, 0, _("Unable to init job cond variable: ERR=%s\n"), be.strerror()); + goto bail_out; } + jcr->term_wait_inited = true; /* * Open database @@ -123,62 +115,68 @@ void run_job(JCR *jcr) jcr->db=db_init_database(jcr, jcr->catalog->db_name, jcr->catalog->db_user, jcr->catalog->db_password, jcr->catalog->db_address, jcr->catalog->db_port, jcr->catalog->db_socket); - if (!db_open_database(jcr, jcr->db)) { - Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db)); - set_jcr_job_status(jcr, JS_ErrorTerminated); - free_jcr(jcr); - return; + if (!jcr->db || !db_open_database(jcr, jcr->db)) { + Jmsg(jcr, M_FATAL, 0, _("Could not open database \"%s\".\n"), + jcr->catalog->db_name); + if (jcr->db) { + Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db)); + } + goto bail_out; } Dmsg0(50, "DB opened\n"); /* * Create Job record */ - jcr->jr.JobStatus = jcr->JobStatus; + create_unique_job_name(jcr, jcr->job->hdr.name); + set_jcr_job_status(jcr, JS_Created); + init_jcr_job_record(jcr); if (!db_create_job_record(jcr, jcr->db, &jcr->jr)) { Jmsg(jcr, M_FATAL, 0, "%s", db_strerror(jcr->db)); - set_jcr_job_status(jcr, JS_ErrorTerminated); - free_jcr(jcr); - return; + goto bail_out; } - jcr->JobId = jcr->jr.JobId; - ASSERT(jcr->jr.JobId > 0); + JobId = jcr->JobId = jcr->jr.JobId; - Dmsg4(30, "Created job record JobId=%d Name=%s Type=%c Level=%c\n", - jcr->JobId, jcr->Job, jcr->jr.Type, jcr->jr.Level); + Dmsg4(100, "Created job record JobId=%d Name=%s Type=%c Level=%c\n", + jcr->JobId, jcr->Job, jcr->jr.JobType, jcr->jr.JobLevel); Dmsg0(200, "Add jrc to work queue\n"); -#ifdef USE_SEMAPHORE - if ((stat = pthread_create(&tid, NULL, job_thread, (void *)jcr)) != 0) { - Emsg1(M_ABORT, 0, _("Unable to create job thread: ERR=%s\n"), strerror(stat)); - } -#else /* Queue the job to be run */ - if ((stat = workq_add(&job_wq, (void *)jcr, &work_item, 0)) != 0) { - Emsg1(M_ABORT, 0, _("Could not add job to work queue: ERR=%s\n"), strerror(stat)); + if ((stat = jobq_add(&job_queue, jcr)) != 0) { + berrno be; + be.set_errno(stat); + Jmsg(jcr, M_FATAL, 0, _("Could not add job queue: ERR=%s\n"), be.strerror()); + JobId = 0; + goto bail_out; } - jcr->work_item = work_item; -#endif - Dmsg0(200, "Done run_job()\n"); + Dmsg0(100, "Done run_job()\n"); + + V(jcr->mutex); + return JobId; + +bail_out: + set_jcr_job_status(jcr, JS_ErrorTerminated); + V(jcr->mutex); + return JobId; + } + /* - * This is the engine called by workq_add() when we were pulled + * This is the engine called by jobq.c:jobq_add() when we were pulled * from the work queue. - * At this point, we are running in our own thread + * At this point, we are running in our own thread and all + * necessary resources are allocated -- see jobq.c */ static void *job_thread(void *arg) { JCR *jcr = (JCR *)arg; - pthread_detach(pthread_self()); - sm_check(__FILE__, __LINE__, True); + jcr->my_thread_id = pthread_self(); + pthread_detach(jcr->my_thread_id); + sm_check(__FILE__, __LINE__, true); for ( ;; ) { - if (!acquire_resource_locks(jcr)) { - set_jcr_job_status(jcr, JS_Canceled); - } - Dmsg0(200, "=====Start Job=========\n"); jcr->start_time = time(NULL); /* set the real start time */ set_jcr_job_status(jcr, JS_Running); @@ -199,289 +197,336 @@ static void *job_thread(void *arg) BPIPE *bpipe; char line[MAXSTRING]; - before = edit_run_codes(jcr, before, jcr->job->RunBeforeJob); + before = edit_job_codes(jcr, before, jcr->job->RunBeforeJob, ""); bpipe = open_bpipe(before, 0, "r"); + free_pool_memory(before); while (fgets(line, sizeof(line), bpipe->rfd)) { Jmsg(jcr, M_INFO, 0, _("RunBefore: %s"), line); } status = close_bpipe(bpipe); if (status != 0) { - Jmsg(jcr, M_FATAL, 0, _("RunBeforeJob returned non-zero status=%d\n"), - status); + berrno be; + be.set_errno(status); + Jmsg(jcr, M_FATAL, 0, _("RunBeforeJob error: ERR=%s\n"), be.strerror()); set_jcr_job_status(jcr, JS_FatalError); update_job_end_record(jcr); - free_pool_memory(before); goto bail_out; } - free_pool_memory(before); } switch (jcr->JobType) { - case JT_BACKUP: - do_backup(jcr); - if (jcr->JobStatus == JS_Terminated) { - do_autoprune(jcr); - } - break; - case JT_VERIFY: - do_verify(jcr); - if (jcr->JobStatus == JS_Terminated) { - do_autoprune(jcr); - } - break; - case JT_RESTORE: - do_restore(jcr); - if (jcr->JobStatus == JS_Terminated) { - do_autoprune(jcr); - } - break; - case JT_ADMIN: - do_admin(jcr); - if (jcr->JobStatus == JS_Terminated) { - do_autoprune(jcr); - } - break; - default: - Pmsg1(0, "Unimplemented job type: %d\n", jcr->JobType); - break; + case JT_BACKUP: + do_backup(jcr); + if (jcr->JobStatus == JS_Terminated) { + do_autoprune(jcr); } - if (jcr->job->RunAfterJob) { + break; + case JT_VERIFY: + do_verify(jcr); + if (jcr->JobStatus == JS_Terminated) { + do_autoprune(jcr); + } + break; + case JT_RESTORE: + do_restore(jcr); + if (jcr->JobStatus == JS_Terminated) { + do_autoprune(jcr); + } + break; + case JT_ADMIN: + do_admin(jcr); + if (jcr->JobStatus == JS_Terminated) { + do_autoprune(jcr); + } + break; + default: + Pmsg1(0, "Unimplemented job type: %d\n", jcr->JobType); + break; + } + if ((jcr->job->RunAfterJob && jcr->JobStatus == JS_Terminated) || + (jcr->job->RunAfterFailedJob && jcr->JobStatus != JS_Terminated)) { POOLMEM *after = get_pool_memory(PM_FNAME); int status; BPIPE *bpipe; char line[MAXSTRING]; - after = edit_run_codes(jcr, after, jcr->job->RunAfterJob); + if (jcr->JobStatus == JS_Terminated) { + after = edit_job_codes(jcr, after, jcr->job->RunAfterJob, ""); + } else { + after = edit_job_codes(jcr, after, jcr->job->RunAfterFailedJob, ""); + } bpipe = open_bpipe(after, 0, "r"); + free_pool_memory(after); while (fgets(line, sizeof(line), bpipe->rfd)) { Jmsg(jcr, M_INFO, 0, _("RunAfter: %s"), line); } status = close_bpipe(bpipe); + /* + * Note, if we get an error here, do not mark the + * job in error, simply report the error condition. + */ if (status != 0) { - Jmsg(jcr, M_FATAL, 0, _("RunAfterJob returned non-zero status=%d\n"), - status); - set_jcr_job_status(jcr, JS_FatalError); - update_job_end_record(jcr); + berrno be; + be.set_errno(status); + if (jcr->JobStatus == JS_Terminated) { + Jmsg(jcr, M_WARNING, 0, _("RunAfterJob error: ERR=%s\n"), be.strerror()); + } else { + Jmsg(jcr, M_FATAL, 0, _("RunAfterFailedJob error: ERR=%s\n"), be.strerror()); + } } - free_pool_memory(after); } - } -bail_out: - release_resource_locks(jcr); - if (jcr->job->RescheduleOnError && - jcr->JobStatus != JS_Terminated && - jcr->JobStatus != JS_Canceled && - jcr->job->RescheduleTimes > 0 && - jcr->reschedule_count < jcr->job->RescheduleTimes) { - - /* - * Reschedule this job by cleaning it up, but - * reuse the same JobId if possible. - */ - jcr->reschedule_count++; - jcr->sched_time = time(NULL) + jcr->job->RescheduleInterval; - Dmsg2(000, "Reschedule Job %s in %d seconds.\n", jcr->Job, - (int)jcr->job->RescheduleInterval); - jcr->JobStatus = JS_Created; /* force new status */ - dird_free_jcr(jcr); /* partial cleanup old stuff */ - if (jcr->JobBytes == 0) { - continue; /* reschedule the job */ + /* Send off any queued messages */ + if (jcr->msg_queue->size() > 0) { + dequeue_messages(jcr); } - /* - * Something was actually backed up, so we cannot reuse - * the old JobId or there will be database record - * conflicts. We now create a new job, copying the - * appropriate fields. - */ - JCR *njcr = new_jcr(sizeof(JCR), dird_free_jcr); - set_jcr_defaults(njcr, jcr->job); - njcr->reschedule_count = jcr->reschedule_count; - njcr->JobLevel = jcr->JobLevel; - njcr->JobStatus = jcr->JobStatus; - njcr->pool = jcr->pool; - njcr->store = jcr->store; - njcr->messages = jcr->messages; - run_job(njcr); } +bail_out: break; } - if (jcr->db) { - Dmsg0(200, "Close DB\n"); - db_close_database(jcr, jcr->db); - jcr->db = NULL; - } - free_jcr(jcr); Dmsg0(50, "======== End Job ==========\n"); - sm_check(__FILE__, __LINE__, True); + sm_check(__FILE__, __LINE__, true); return NULL; } + /* - * Acquire the resources needed. These locks limit the - * number of jobs by each resource. We have limits on - * Jobs, Clients, Storage, and total jobs. + * Cancel a job -- typically called by the UA (Console program), but may also + * be called by the job watchdog. + * + * Returns: 1 if cancel appears to be successful + * 0 on failure. Message sent to ua->jcr. */ -static int acquire_resource_locks(JCR *jcr) +int cancel_job(UAContext *ua, JCR *jcr) { - time_t now = time(NULL); - time_t wtime = jcr->sched_time - now; - - /* Wait until scheduled time arrives */ - if (wtime > 0 && verbose) { - Jmsg(jcr, M_INFO, 0, _("Job %s waiting %d seconds for scheduled start time.\n"), - jcr->Job, wtime); - set_jcr_job_status(jcr, JS_WaitStartTime); - } - /* Check every 30 seconds if canceled */ - while (wtime > 0) { - Dmsg2(100, "Waiting on sched time, jobid=%d secs=%d\n", jcr->JobId, wtime); - if (wtime > 30) { - wtime = 30; + BSOCK *sd, *fd; + + switch (jcr->JobStatus) { + case JS_Created: + case JS_WaitJobRes: + case JS_WaitClientRes: + case JS_WaitStoreRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + set_jcr_job_status(jcr, JS_Canceled); + bsendmsg(ua, _("JobId %d, Job %s marked to be canceled.\n"), + jcr->JobId, jcr->Job); + jobq_remove(&job_queue, jcr); /* attempt to remove it from queue */ + return 1; + + default: + set_jcr_job_status(jcr, JS_Canceled); + + /* Cancel File daemon */ + if (jcr->file_bsock) { + ua->jcr->client = jcr->client; + if (!connect_to_file_daemon(ua->jcr, 10, FDConnectTimeout, 1)) { + bsendmsg(ua, _("Failed to connect to File daemon.\n")); + return 0; + } + Dmsg0(200, "Connected to file daemon\n"); + fd = ua->jcr->file_bsock; + bnet_fsend(fd, "cancel Job=%s\n", jcr->Job); + while (bnet_recv(fd) >= 0) { + bsendmsg(ua, "%s", fd->msg); + } + bnet_sig(fd, BNET_TERMINATE); + bnet_close(fd); + ua->jcr->file_bsock = NULL; } - bmicrosleep(wtime, 0); - if (job_canceled(jcr)) { - return 0; + + /* Cancel Storage daemon */ + if (jcr->store_bsock) { + ua->jcr->store = jcr->store; + if (!connect_to_storage_daemon(ua->jcr, 10, SDConnectTimeout, 1)) { + bsendmsg(ua, _("Failed to connect to Storage daemon.\n")); + return 0; + } + Dmsg0(200, "Connected to storage daemon\n"); + sd = ua->jcr->store_bsock; + bnet_fsend(sd, "cancel Job=%s\n", jcr->Job); + while (bnet_recv(sd) >= 0) { + bsendmsg(ua, "%s", sd->msg); + } + bnet_sig(sd, BNET_TERMINATE); + bnet_close(sd); + ua->jcr->store_bsock = NULL; } - wtime = jcr->sched_time - time(NULL); } + return 1; +} -#ifdef USE_SEMAPHORE - int stat; - /* Initialize semaphores */ - if (jcr->store->sem.valid != SEMLOCK_VALID) { - if ((stat = sem_init(&jcr->store->sem, jcr->store->MaxConcurrentJobs)) != 0) { - Emsg1(M_ABORT, 0, _("Could not init Storage semaphore: ERR=%s\n"), strerror(stat)); - } - } - if (jcr->client->sem.valid != SEMLOCK_VALID) { - if ((stat = sem_init(&jcr->client->sem, jcr->client->MaxConcurrentJobs)) != 0) { - Emsg1(M_ABORT, 0, _("Could not init Client semaphore: ERR=%s\n"), strerror(stat)); - } - } - if (jcr->job->sem.valid != SEMLOCK_VALID) { - if ((stat = sem_init(&jcr->job->sem, jcr->job->MaxConcurrentJobs)) != 0) { - Emsg1(M_ABORT, 0, _("Could not init Job semaphore: ERR=%s\n"), strerror(stat)); - } - } +static void job_monitor_destructor(watchdog_t *self) +{ + JCR *control_jcr = (JCR *) self->data; - for ( ;; ) { - /* Acquire semaphore */ - set_jcr_job_status(jcr, JS_WaitJobRes); - if ((stat = sem_lock(&jcr->job->sem)) != 0) { - Emsg1(M_ABORT, 0, _("Could not acquire Job max jobs lock: ERR=%s\n"), strerror(stat)); - } - set_jcr_job_status(jcr, JS_WaitClientRes); - if ((stat = sem_trylock(&jcr->client->sem)) != 0) { - if (stat == EBUSY) { - backoff_resource_locks(jcr, 1); - goto wait; - } else { - Emsg1(M_ABORT, 0, _("Could not acquire Client max jobs lock: ERR=%s\n"), strerror(stat)); - } - } - set_jcr_job_status(jcr, JS_WaitStoreRes); - if ((stat = sem_trylock(&jcr->store->sem)) != 0) { - if (stat == EBUSY) { - backoff_resource_locks(jcr, 2); - goto wait; - } else { - Emsg1(M_ABORT, 0, _("Could not acquire Storage max jobs lock: ERR=%s\n"), strerror(stat)); - } - } - set_jcr_job_status(jcr, JS_WaitMaxJobs); - if ((stat = sem_trylock(&job_lock)) != 0) { - if (stat == EBUSY) { - backoff_resource_locks(jcr, 3); - goto wait; - } else { - Emsg1(M_ABORT, 0, _("Could not acquire max jobs lock: ERR=%s\n"), strerror(stat)); - } + free_jcr(control_jcr); +} + +static void job_monitor_watchdog(watchdog_t *self) +{ + JCR *control_jcr, *jcr; + + control_jcr = (JCR *)self->data; + + Dmsg1(400, "job_monitor_watchdog %p called\n", self); + + lock_jcr_chain(); + + foreach_jcr(jcr) { + bool cancel; + + if (jcr->JobId == 0) { + Dmsg2(400, "Skipping JCR %p (%s) with JobId 0\n", + jcr, jcr->Job); + /* Keep reference counts correct */ + free_locked_jcr(jcr); + continue; } - break; -wait: - if (job_canceled(jcr)) { - return 0; + /* check MaxWaitTime */ + cancel = job_check_maxwaittime(control_jcr, jcr); + + /* check MaxRunTime */ + cancel |= job_check_maxruntime(control_jcr, jcr); + + if (cancel) { + Dmsg3(200, "Cancelling JCR %p jobid %d (%s)\n", + jcr, jcr->JobId, jcr->Job); + + UAContext *ua = new_ua_context(jcr); + ua->jcr = control_jcr; + cancel_job(ua, jcr); + free_ua_context(ua); + + Dmsg1(200, "Have cancelled JCR %p\n", jcr); } - P(mutex); - /* - * Wait for a resource to be released either by backoff or - * by a job terminating. - */ - waiting++; - pthread_cond_wait(&resource_wait, &mutex); - waiting--; - V(mutex); - /* Try again */ - } - jcr->acquired_resource_locks = 1; -#endif - return 1; + + /* Keep reference counts correct */ + free_locked_jcr(jcr); + } + unlock_jcr_chain(); } -#ifdef USE_SEMAPHORE /* - * We could not get all the resource locks because - * too many jobs are running, so release any locks - * we did acquire, giving others a chance to use them - * while we wait. + * Check if the maxwaittime has expired and it is possible + * to cancel the job. */ -static void backoff_resource_locks(JCR *jcr, int count) +static bool job_check_maxwaittime(JCR *control_jcr, JCR *jcr) { - P(mutex); - switch (count) { - case 3: - sem_unlock(&jcr->store->sem); - /* Fall through wanted */ - case 2: - sem_unlock(&jcr->client->sem); - /* Fall through wanted */ - case 1: - sem_unlock(&jcr->job->sem); + bool cancel = false; + + if (jcr->job->MaxWaitTime == 0) { + return false; + } + if ((watchdog_time - jcr->start_time) < jcr->job->MaxWaitTime) { + Dmsg3(200, "Job %p (%s) with MaxWaitTime %d not expired\n", + jcr, jcr->Job, jcr->job->MaxWaitTime); + return false; + } + Dmsg3(200, "Job %d (%s): MaxWaitTime of %d seconds exceeded, " + "checking status\n", + jcr->JobId, jcr->Job, jcr->job->MaxWaitTime); + switch (jcr->JobStatus) { + case JS_Created: + case JS_Blocked: + case JS_WaitFD: + case JS_WaitSD: + case JS_WaitStoreRes: + case JS_WaitClientRes: + case JS_WaitJobRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + cancel = true; + Dmsg0(200, "JCR blocked in #1\n"); break; + case JS_Running: + Dmsg0(200, "JCR running, checking SD status\n"); + switch (jcr->SDJobStatus) { + case JS_WaitMount: + case JS_WaitMedia: + case JS_WaitFD: + cancel = true; + Dmsg0(200, "JCR blocked in #2\n"); + break; + default: + Dmsg0(200, "JCR not blocked in #2\n"); + break; + } + break; + case JS_Terminated: + case JS_ErrorTerminated: + case JS_Canceled: + case JS_FatalError: + Dmsg0(200, "JCR already dead in #3\n"); + break; + default: + Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), + jcr->JobStatus); } - /* - * Since we released a lock, if there are any threads - * waiting, wake them up so that they can try again. - */ - if (waiting > 0) { - pthread_cond_broadcast(&resource_wait); - } - V(mutex); + Dmsg3(200, "MaxWaitTime result: %scancel JCR %p (%s)\n", + cancel ? "" : "do not ", jcr, jcr->job); + + return cancel; } -#endif /* - * This is called at the end of the job to release - * any resource limits on the number of jobs. If - * there are any other jobs waiting, we wake them - * up so that they can try again. + * Check if maxruntime has expired and if the job can be + * canceled. */ -static void release_resource_locks(JCR *jcr) +static bool job_check_maxruntime(JCR *control_jcr, JCR *jcr) { - if (!jcr->acquired_resource_locks) { - return; /* Job canceled, no locks acquired */ - } -#ifdef USE_SEMAPHORE - P(mutex); - sem_unlock(&jcr->store->sem); - sem_unlock(&jcr->client->sem); - sem_unlock(&jcr->job->sem); - sem_unlock(&job_lock); - if (waiting > 0) { - pthread_cond_broadcast(&resource_wait); - } - jcr->acquired_resource_locks = 0; - V(mutex); -#endif + bool cancel = false; + + if (jcr->job->MaxRunTime == 0) { + return false; + } + if ((watchdog_time - jcr->start_time) < jcr->job->MaxRunTime) { + Dmsg3(200, "Job %p (%s) with MaxRunTime %d not expired\n", + jcr, jcr->Job, jcr->job->MaxRunTime); + return false; + } + + switch (jcr->JobStatus) { + case JS_Created: + case JS_Running: + case JS_Blocked: + case JS_WaitFD: + case JS_WaitSD: + case JS_WaitStoreRes: + case JS_WaitClientRes: + case JS_WaitJobRes: + case JS_WaitPriority: + case JS_WaitMaxJobs: + case JS_WaitStartTime: + case JS_Differences: + cancel = true; + break; + case JS_Terminated: + case JS_ErrorTerminated: + case JS_Canceled: + case JS_FatalError: + cancel = false; + break; + default: + Jmsg1(jcr, M_ERROR, 0, _("Unhandled job status code %d\n"), + jcr->JobStatus); + } + + Dmsg3(200, "MaxRunTime result: %scancel JCR %p (%s)\n", + cancel ? "" : "do not ", jcr, jcr->job); + + return cancel; } + /* * Get or create a Client record for this Job */ -int get_or_create_client_record(JCR *jcr) +bool get_or_create_client_record(JCR *jcr) { CLIENT_DBR cr; @@ -493,33 +538,75 @@ int get_or_create_client_record(JCR *jcr) if (!jcr->client_name) { jcr->client_name = get_pool_memory(PM_NAME); } - pm_strcpy(&jcr->client_name, jcr->client->hdr.name); + pm_strcpy(jcr->client_name, jcr->client->hdr.name); if (!db_create_client_record(jcr, jcr->db, &cr)) { Jmsg(jcr, M_FATAL, 0, _("Could not create Client record. ERR=%s\n"), db_strerror(jcr->db)); - return 0; + return false; } jcr->jr.ClientId = cr.ClientId; if (cr.Uname[0]) { if (!jcr->client_uname) { jcr->client_uname = get_pool_memory(PM_NAME); } - pm_strcpy(&jcr->client_uname, cr.Uname); + pm_strcpy(jcr->client_uname, cr.Uname); } Dmsg2(100, "Created Client %s record %d\n", jcr->client->hdr.name, jcr->jr.ClientId); - return 1; + return true; } +bool get_or_create_fileset_record(JCR *jcr, FILESET_DBR *fsr) +{ + /* + * Get or Create FileSet record + */ + memset(fsr, 0, sizeof(FILESET_DBR)); + bstrncpy(fsr->FileSet, jcr->fileset->hdr.name, sizeof(fsr->FileSet)); + if (jcr->fileset->have_MD5) { + struct MD5Context md5c; + unsigned char signature[16]; + memcpy(&md5c, &jcr->fileset->md5c, sizeof(md5c)); + MD5Final(signature, &md5c); + bin_to_base64(fsr->MD5, (char *)signature, 16); /* encode 16 bytes */ + bstrncpy(jcr->fileset->MD5, fsr->MD5, sizeof(jcr->fileset->MD5)); + } else { + Jmsg(jcr, M_WARNING, 0, _("FileSet MD5 signature not found.\n")); + } + if (!db_create_fileset_record(jcr, jcr->db, fsr)) { + Jmsg(jcr, M_ERROR, 0, _("Could not create FileSet \"%s\" record. ERR=%s\n"), + fsr->FileSet, db_strerror(jcr->db)); + return false; + } + jcr->jr.FileSetId = fsr->FileSetId; + if (fsr->created) { + Jmsg(jcr, M_INFO, 0, _("Created new FileSet record \"%s\" %s\n"), + fsr->FileSet, fsr->cCreateTime); + } + Dmsg2(119, "Created FileSet %s record %u\n", jcr->fileset->hdr.name, + jcr->jr.FileSetId); + return true; +} + +void init_jcr_job_record(JCR *jcr) +{ + jcr->jr.SchedTime = jcr->sched_time; + jcr->jr.StartTime = jcr->start_time; + jcr->jr.EndTime = 0; /* perhaps rescheduled, clear it */ + jcr->jr.JobType = jcr->JobType; + jcr->jr.JobLevel = jcr->JobLevel; + jcr->jr.JobStatus = jcr->JobStatus; + jcr->jr.JobId = jcr->JobId; + bstrncpy(jcr->jr.Name, jcr->job->hdr.name, sizeof(jcr->jr.Name)); + bstrncpy(jcr->jr.Job, jcr->Job, sizeof(jcr->jr.Job)); +} /* * Write status and such in DB */ void update_job_end_record(JCR *jcr) { - if (jcr->jr.EndTime == 0) { - jcr->jr.EndTime = time(NULL); - } + jcr->jr.EndTime = time(NULL); jcr->end_time = jcr->jr.EndTime; jcr->jr.JobId = jcr->JobId; jcr->jr.JobStatus = jcr->JobStatus; @@ -540,7 +627,7 @@ void update_job_end_record(JCR *jcr) * Returns: unique job name in jcr->Job * date/time in jcr->start_time */ -void create_unique_job_name(JCR *jcr, char *base_name) +void create_unique_job_name(JCR *jcr, const char *base_name) { /* Job start mutex */ static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; @@ -623,6 +710,10 @@ void dird_free_jcr(JCR *jcr) free_pool_memory(jcr->client_uname); jcr->client_uname = NULL; } + if (jcr->term_wait_inited) { + pthread_cond_destroy(&jcr->term_wait); + } + jcr->job_end_push.destroy(); Dmsg0(200, "End dird free_jcr\n"); } @@ -637,19 +728,33 @@ void set_jcr_defaults(JCR *jcr, JOB *job) { jcr->job = job; jcr->JobType = job->JobType; - jcr->JobLevel = job->level; + switch (jcr->JobType) { + case JT_ADMIN: + case JT_RESTORE: + jcr->JobLevel = L_NONE; + break; + default: + jcr->JobLevel = job->level; + break; + } + jcr->JobPriority = job->Priority; jcr->store = job->storage; jcr->client = job->client; if (!jcr->client_name) { jcr->client_name = get_pool_memory(PM_NAME); } - pm_strcpy(&jcr->client_name, jcr->client->hdr.name); + pm_strcpy(jcr->client_name, jcr->client->hdr.name); jcr->pool = job->pool; + jcr->full_pool = job->full_pool; + jcr->inc_pool = job->inc_pool; + jcr->dif_pool = job->dif_pool; jcr->catalog = job->client->catalog; jcr->fileset = job->fileset; jcr->messages = job->messages; + jcr->spool_data = job->spool_data; if (jcr->RestoreBootstrap) { free(jcr->RestoreBootstrap); + jcr->RestoreBootstrap = NULL; } /* This can be overridden by Console program */ if (job->RestoreBootstrap) { @@ -666,87 +771,10 @@ void set_jcr_defaults(JCR *jcr, JOB *job) break; case JT_RESTORE: case JT_ADMIN: - jcr->JobLevel = L_FULL; + jcr->JobLevel = L_NONE; break; default: break; } } } - -/* - * Edit codes into Run command - * %% = % - * %c = Client's name - * %d = Director's name - * %i = JobId - * %e = Job Exit - * %j = Job - * %l = Job Level - * %n = Job name - * %t = Job type - * - * omsg = edited output message - * imsg = input string containing edit codes (%x) - * - */ -static char *edit_run_codes(JCR *jcr, char *omsg, char *imsg) -{ - char *p; - const char *str; - char add[20]; - - *omsg = 0; - Dmsg1(200, "edit_run_codes: %s\n", imsg); - for (p=imsg; *p; p++) { - if (*p == '%') { - switch (*++p) { - case '%': - str = "%"; - break; - case 'c': - str = jcr->client_name; - if (!str) { - str = ""; - } - break; - case 'd': - str = my_name; - break; - case 'e': - str = job_status_to_str(jcr->JobStatus); - break; - case 'i': - sprintf(add, "%d", jcr->JobId); - str = add; - break; - case 'j': /* Job */ - str = jcr->Job; - break; - case 'l': - str = job_level_to_str(jcr->JobLevel); - break; - case 'n': - str = jcr->job->hdr.name; - break; - case 't': - str = job_type_to_str(jcr->JobType); - break; - default: - add[0] = '%'; - add[1] = *p; - add[2] = 0; - str = add; - break; - } - } else { - add[0] = *p; - add[1] = 0; - str = add; - } - Dmsg1(200, "add_str %s\n", str); - pm_strcat(&omsg, (char *)str); - Dmsg1(200, "omsg=%s\n", omsg); - } - return omsg; -}