*
*/
/*
- Copyright (C) 2000, 2001, 2002 Kern Sibbald and John Walker
+ Copyright (C) 2000-2003 Kern Sibbald and John Walker
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
#include "bacula.h"
#include "jcr.h"
-struct s_last_job last_job; /* last job run by this daemon */
+/* External variables we reference */
+/* Compared with each socket's timer_start in jcr_timeout_check() */
+extern time_t watchdog_time;
+
+/* Forward referenced functions */
+static void timeout_handler(int sig);
+static void jcr_timeout_check(watchdog_t *self);
+
+struct s_last_job last_job; /* last job run by this daemon */
+/* List of last_job copies, appended by free_jcr(); created by init_last_jobs_list() */
+dlist *last_jobs = NULL;
+/* free_jcr() drops the first list entry once the list holds more than this */
+#define MAX_LAST_JOBS 10
static JCR *jobs = NULL; /* pointer to JCR chain */
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Mutex for locking various jcr chains while updating */
+static pthread_mutex_t jcr_chain_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Create the last jobs list if it does not exist yet and clear
+ *  the last_job record. Once the list exists, further calls
+ *  do nothing.
+ */
+void init_last_jobs_list()
+{
+   /*
+    * This pointer is never dereferenced here; it is only handed to
+    *  the dlist constructor together with the address of its link
+    *  member. Start from NULL so no indeterminate value is read.
+    */
+   struct s_last_job *job_entry = NULL;
+
+   if (!last_jobs) {
+      last_jobs = new dlist(job_entry, &job_entry->link);
+      memset(&last_job, 0, sizeof(last_job));
+   }
+}
+
+/*
+ * Release every entry of the last jobs list, then the list itself,
+ *  and reset last_jobs to NULL. Does nothing when the list was
+ *  never created.
+ */
+void term_last_jobs_list()
+{
+   if (last_jobs) {
+      /*
+       * Unlink each entry before freeing it. Freeing inside a
+       *  list walk would make the walk step through memory that
+       *  has already been released.
+       */
+      while (last_jobs->size() > 0) {
+         void *je = last_jobs->first();
+         last_jobs->remove(je);
+         free(je);
+      }
+      delete last_jobs;
+      last_jobs = NULL;
+   }
+}
+
+/*
+ * Lock the last jobs list. The list has no mutex of its own;
+ *  this takes jcr_chain_mutex, the mutex free_jcr() holds while
+ *  it appends to the list. Pair with unlock_last_jobs_list().
+ */
+void lock_last_jobs_list()
+{
+ /* Use jcr chain mutex */
+ P(jcr_chain_mutex);
+}
+
+/*
+ * Unlock the last jobs list by releasing jcr_chain_mutex,
+ *  the mutex taken by lock_last_jobs_list().
+ */
+void unlock_last_jobs_list()
+{
+ /* Use jcr chain mutex */
+ V(jcr_chain_mutex);
+}
/*
* Create a Job Control Record and link it into JCR chain
JCR *new_jcr(int size, JCR_free_HANDLER *daemon_free_jcr)
{
JCR *jcr;
+ struct sigaction sigtimer;
Dmsg0(200, "Enter new_jcr\n");
- jcr = (JCR *) malloc(size);
+ jcr = (JCR *)malloc(size);
memset(jcr, 0, size);
jcr->my_thread_id = pthread_self();
jcr->sched_time = time(NULL);
jcr->VolumeName[0] = 0;
jcr->errmsg = get_pool_memory(PM_MESSAGE);
jcr->errmsg[0] = 0;
+ strcpy(jcr->Job, "*Console*"); /* default */
- P(mutex);
+ sigtimer.sa_flags = 0;
+ sigtimer.sa_handler = timeout_handler;
+ sigfillset(&sigtimer.sa_mask);
+ sigaction(TIMEOUT_SIGNAL, &sigtimer, NULL);
+
+ P(jcr_chain_mutex);
jcr->prev = NULL;
jcr->next = jobs;
if (jobs) {
jobs->prev = jcr;
}
jobs = jcr;
- V(mutex);
+ V(jcr_chain_mutex);
return jcr;
}
}
/*
- * Free stuff common to all JCRs
+ * Free stuff common to all JCRs. N.B. Be careful to include only
+ * generic stuff in the common part of the jcr.
*/
static void free_common_jcr(JCR *jcr)
{
/* Keep some statistics */
switch (jcr->JobType) {
- case JT_BACKUP:
- case JT_VERIFY:
- case JT_RESTORE:
- last_job.NumJobs++;
- last_job.JobType = jcr->JobType;
- last_job.JobId = jcr->JobId;
- last_job.VolSessionId = jcr->VolSessionId;
- last_job.VolSessionTime = jcr->VolSessionTime;
- strcpy(last_job.Job, jcr->Job);
- last_job.JobFiles = jcr->JobFiles;
- last_job.JobBytes = jcr->JobBytes;
- last_job.JobStatus = jcr->JobStatus;
- last_job.start_time = jcr->start_time;
- last_job.end_time = time(NULL);
- break;
- default:
- break;
+ case JT_BACKUP:
+ case JT_VERIFY:
+ case JT_RESTORE:
+ case JT_ADMIN:
+ last_job.NumJobs++;
+ last_job.JobType = jcr->JobType;
+ last_job.JobId = jcr->JobId;
+ last_job.VolSessionId = jcr->VolSessionId;
+ last_job.VolSessionTime = jcr->VolSessionTime;
+ bstrncpy(last_job.Job, jcr->Job, sizeof(last_job.Job));
+ last_job.JobFiles = jcr->JobFiles;
+ last_job.JobBytes = jcr->JobBytes;
+ last_job.JobStatus = jcr->JobStatus;
+ last_job.JobLevel = jcr->JobLevel;
+ last_job.start_time = jcr->start_time;
+ last_job.end_time = time(NULL);
+ break;
+ default:
+ break;
}
pthread_mutex_destroy(&jcr->mutex);
}
if (jcr->sd_auth_key) {
- Dmsg0(200, "Free JCR sd_auth_key\n");
free(jcr->sd_auth_key);
jcr->sd_auth_key = NULL;
}
free_pool_memory(jcr->errmsg);
jcr->errmsg = NULL;
}
+ if (jcr->where) {
+ free(jcr->where);
+ jcr->where = NULL;
+ }
+ if (jcr->cached_path) {
+ free_pool_memory(jcr->cached_path);
+ jcr->cached_path = NULL;
+ jcr->cached_pnl = 0;
+ }
+ free_getuser_cache();
+ free_getgroup_cache();
free(jcr);
}
void free_jcr(JCR *jcr)
{
+
Dmsg1(200, "Enter free_jcr 0x%x\n", jcr);
#endif
+   /*
+    * Drop one reference to the jcr. While other users remain, only
+    *  the use count changes. The last user unlinks the jcr from the
+    *  chain, calls the daemon free routine (if one is set) and the
+    *  common free routine, then stores a copy of last_job in the
+    *  last_jobs list. jcr_chain_mutex is held from the decrement
+    *  until the list has been updated.
+    */
+   struct s_last_job *je;
- P(mutex);
+   P(jcr_chain_mutex);
jcr->use_count--; /* decrement use count */
- Dmsg2(200, "Decrement jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
+   Dmsg3(200, "Dec free_jcr 0x%x use_count=%d jobid=%d\n", jcr, jcr->use_count, jcr->JobId);
if (jcr->use_count > 0) { /* if in use */
- V(mutex);
- Dmsg2(200, "jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
+      V(jcr_chain_mutex);
+      Dmsg2(200, "free_jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
return;
}
remove_jcr(jcr);
- V(mutex);
- jcr->daemon_free_jcr(jcr); /* call daemon free routine */
+   Dmsg1(200, "End job=%d\n", jcr->JobId);
+   if (jcr->daemon_free_jcr) {
+      jcr->daemon_free_jcr(jcr); /* call daemon free routine */
+   }
+
free_common_jcr(jcr);
+
+   /* Keep list of last jobs, but not Console where JobId==0 */
+   if (last_job.JobId > 0) {
+      je = (struct s_last_job *)malloc(sizeof(struct s_last_job));
+      memcpy((char *)je, (char *)&last_job, sizeof(last_job));
+      if (!last_jobs) {
+         init_last_jobs_list();
+      }
+      last_jobs->append(je);
+      if (last_jobs->size() > MAX_LAST_JOBS) {
+         /*
+          * Every entry was malloc'ed above, so the evicted one must
+          *  be freed after it is unlinked or it is leaked.
+          */
+         je = (struct s_last_job *)last_jobs->first();
+         last_jobs->remove(je);
+         free(je);
+      }
+      last_job.JobId = 0; /* zap last job */
+   }
+   close_msg(NULL); /* flush any daemon messages */
+   V(jcr_chain_mutex);
Dmsg0(200, "Exit free_jcr\n");
}
void free_locked_jcr(JCR *jcr)
{
jcr->use_count--; /* decrement use count */
- Dmsg2(200, "Decrement jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
+ Dmsg2(200, "Dec free_locked_jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
if (jcr->use_count > 0) { /* if in use */
return;
}
{
JCR *jcr;
- P(mutex);
+ P(jcr_chain_mutex); /* lock chain */
for (jcr = jobs; jcr; jcr=jcr->next) {
if (jcr->JobId == JobId) {
+ P(jcr->mutex);
jcr->use_count++;
- Dmsg2(200, "Increment jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
+ V(jcr->mutex);
+ Dmsg2(200, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
break;
}
}
- V(mutex);
+ V(jcr_chain_mutex);
return jcr;
}
{
JCR *jcr;
- P(mutex);
+ P(jcr_chain_mutex);
for (jcr = jobs; jcr; jcr=jcr->next) {
if (jcr->VolSessionId == SessionId &&
jcr->VolSessionTime == SessionTime) {
+ P(jcr->mutex);
jcr->use_count++;
- Dmsg2(200, "Increment jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
+ V(jcr->mutex);
+ Dmsg2(200, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
break;
}
}
- V(mutex);
+ V(jcr_chain_mutex);
return jcr;
}
JCR *jcr;
int len;
- P(mutex);
+ if (!Job) {
+ return NULL;
+ }
+ P(jcr_chain_mutex);
len = strlen(Job);
for (jcr = jobs; jcr; jcr=jcr->next) {
if (strncmp(Job, jcr->Job, len) == 0) {
+ P(jcr->mutex);
jcr->use_count++;
- Dmsg2(200, "Increment jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
+ V(jcr->mutex);
+ Dmsg2(200, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
break;
}
}
- V(mutex);
+ V(jcr_chain_mutex);
return jcr;
}
{
JCR *jcr;
- P(mutex);
+ if (!Job) {
+ return NULL;
+ }
+ P(jcr_chain_mutex);
for (jcr = jobs; jcr; jcr=jcr->next) {
if (strcmp(jcr->Job, Job) == 0) {
+ P(jcr->mutex);
jcr->use_count++;
- Dmsg2(200, "Increment jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
+ V(jcr->mutex);
+ Dmsg2(200, "Inc get_jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
break;
}
}
- V(mutex);
+ V(jcr_chain_mutex);
return jcr;
}
+/*
+ * Store a new status in the jcr, unless the jcr already holds one
+ *  of the statuses that must not be overwritten.
+ */
+void set_jcr_job_status(JCR *jcr, int JobStatus)
+{
+   const int current = jcr->JobStatus;
+
+   /*
+    * An error, differences or canceled status is kept as is,
+    *  so that a later update cannot hide it.
+    */
+   if (current == JS_ErrorTerminated || current == JS_Error ||
+       current == JS_FatalError || current == JS_Differences ||
+       current == JS_Canceled) {
+      return;
+   }
+   jcr->JobStatus = JobStatus;
+}
+
/*
* Lock the chain
*/
void lock_jcr_chain()
{
- P(mutex);
+ /* Same mutex new_jcr() and free_jcr() hold while relinking the chain */
+ P(jcr_chain_mutex);
}
/*
*/
void unlock_jcr_chain()
{
- V(mutex);
+ /* Releases the mutex taken by lock_jcr_chain() */
+ V(jcr_chain_mutex);
}
+/*
+ * Step through the JCR chain.
+ *
+ *  prev_jcr  NULL to start at the head of the chain, otherwise the
+ *            jcr returned by the previous call.
+ *
+ *  Returns: the next jcr with its use_count incremented, or NULL
+ *           at the end of the chain.
+ *
+ *  This routine does not take jcr_chain_mutex itself; the caller
+ *  in this file (jcr_timeout_check) calls lock_jcr_chain() first.
+ */
-JCR *get_next_jcr(JCR *jcr)
+JCR *get_next_jcr(JCR *prev_jcr)
{
- JCR *rjcr;
+ JCR *jcr;
- if (jcr == NULL) {
- rjcr = jobs;
+ if (prev_jcr == NULL) {
+ jcr = jobs;
} else {
- rjcr = jcr->next;
+ jcr = prev_jcr->next;
+ }
+ if (jcr) {
+ /* The use count is changed under the jcr's own mutex */
+ P(jcr->mutex);
+ jcr->use_count++;
+ V(jcr->mutex);
+ Dmsg2(200, "Inc get_next_jcr 0x%x use_count=%d\n", jcr, jcr->use_count);
}
- if (rjcr) {
- rjcr->use_count++;
- Dmsg1(200, "Increment jcr use_count=%d\n", rjcr->use_count);
+ return jcr;
+}
+
+/*
+ * Interval handed to the watchdog that runs jcr_timeout_check();
+ *  presumably in seconds (confirm against the watchdog code).
+ *  May be overridden at compile time.
+ */
+#ifndef JCR_TIMEOUT_CHECK_INTERVAL
+#define JCR_TIMEOUT_CHECK_INTERVAL 30
+#endif
+
+/*
+ * Create and register a repeating (not one-shot) watchdog whose
+ *  callback is jcr_timeout_check().
+ *
+ *  Returns: true always; no failure is reported from here.
+ */
+bool init_jcr_subsystem(void)
+{
+   watchdog_t *wd = new_watchdog();
+
+   wd->one_shot = false;
+   wd->interval = JCR_TIMEOUT_CHECK_INTERVAL;
+   wd->callback = jcr_timeout_check;
+
+   register_watchdog(wd);
+
+   return true;
+}
+
+/*
+ * Watchdog callback registered by init_jcr_subsystem().
+ *
+ *  With the JCR chain locked, look at the store, file and dir
+ *  sockets of every jcr whose JobId is not 0. If a socket's timer
+ *  is running and more than its timeout has passed, the timer is
+ *  turned off, the socket is flagged as timed out, an error is
+ *  logged and TIMEOUT_SIGNAL is sent to the jcr's thread.
+ *
+ *  self  the watchdog that fired; not used here.
+ */
+static void jcr_timeout_check(watchdog_t *self)
+{
+   JCR *jcr;
+   BSOCK *fd;
+   time_t timer_start;
+
+   Dmsg0(400, "Start JCR timeout checks\n");
+
+   /* Walk through all JCRs checking if any one is
+    * blocked for more than specified max time.
+    */
+   lock_jcr_chain();
+   for (jcr=NULL; (jcr=get_next_jcr(jcr)); ) {
+      /*
+       * Give back the reference get_next_jcr() took. free_jcr()
+       *  needs the chain mutex, which is held here.
+       */
+      free_locked_jcr(jcr); /* OK to free now cuz chain is locked */
+      if (jcr->JobId == 0) {
+         continue;
+      }
+      fd = jcr->store_bsock;
+      if (fd) {
+         timer_start = fd->timer_start;
+         if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
+            fd->timer_start = 0; /* turn off timer */
+            fd->timed_out = TRUE;
+            /* %d expects an int; a time_t difference may be wider */
+            Jmsg(jcr, M_ERROR, 0, _(
+"Watchdog sending kill after %d secs to thread stalled reading Storage daemon.\n"),
+               (int)(watchdog_time - timer_start));
+            pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+         }
+      }
+      fd = jcr->file_bsock;
+      if (fd) {
+         timer_start = fd->timer_start;
+         if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
+            fd->timer_start = 0; /* turn off timer */
+            fd->timed_out = TRUE;
+            /* %d expects an int; a time_t difference may be wider */
+            Jmsg(jcr, M_ERROR, 0, _(
+"Watchdog sending kill after %d secs to thread stalled reading File daemon.\n"),
+               (int)(watchdog_time - timer_start));
+            pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+         }
+      }
+      fd = jcr->dir_bsock;
+      if (fd) {
+         timer_start = fd->timer_start;
+         if (timer_start && (watchdog_time - timer_start) > fd->timeout) {
+            fd->timer_start = 0; /* turn off timer */
+            fd->timed_out = TRUE;
+            /* %d expects an int; a time_t difference may be wider */
+            Jmsg(jcr, M_ERROR, 0, _(
+"Watchdog sending kill after %d secs to thread stalled reading Director.\n"),
+               (int)(watchdog_time - timer_start));
+            pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
+         }
+      }
+
}
- return rjcr;
+   unlock_jcr_chain();
+
+   Dmsg0(200, "Finished JCR timeout checks\n");
+}
+
+/*
+ * Timeout signal comes here.
+ *
+ *  new_jcr() installs this handler for TIMEOUT_SIGNAL, and
+ *  jcr_timeout_check() sends that signal to a stalled thread.
+ *  The handler does nothing on purpose: its delivery is what
+ *  interrupts the function the thread is in.
+ *
+ *  Defined static to match its forward declaration.
+ */
+static void timeout_handler(int sig)
+{
+   (void)sig; /* signal number is not needed */
+   return; /* thus interrupting the function */
}