git.sur5r.net Git - bacula/bacula/commitdiff
Fix cancel crash bug #1551
author Eric Bollengier <eric@eb.homelinux.org>
Fri, 23 Apr 2010 06:15:51 +0000 (08:15 +0200)
committer Eric Bollengier <eric@eb.homelinux.org>
Mon, 2 Aug 2010 14:53:44 +0000 (16:53 +0200)
bacula/src/dird/job.c
bacula/src/dird/jobq.c
bacula/src/dird/msgchan.c
bacula/src/dird/protos.h
bacula/src/filed/job.c
bacula/src/jcr.h
bacula/src/lib/jcr.c
bacula/src/lib/protos.h
bacula/src/stored/stored.c

index 1595f56c180ead5c237a3ea2e70d35d8596b2aa8..88ac5b84e46b0ba299998af94cf0abb20f67c103 100644 (file)
@@ -364,6 +364,18 @@ static void *job_thread(void *arg)
    return NULL;
 }
 
+void sd_msg_thread_send_signal(JCR *jcr, int sig)
+{
+   jcr->lock();
+   if (  !jcr->sd_msg_thread_done
+       && jcr->SD_msg_chan 
+       && !pthread_equal(jcr->SD_msg_chan, pthread_self()))
+   {
+      Dmsg1(800, "Send kill to SD msg chan jid=%d\n", jcr->JobId);
+      pthread_kill(jcr->SD_msg_chan, sig);
+   }
+   jcr->unlock();
+}
 
 /*
  * Cancel a job -- typically called by the UA (Console program), but may also
@@ -411,10 +423,7 @@ bool cancel_job(UAContext *ua, JCR *jcr)
          fd->close();
          ua->jcr->file_bsock = NULL;
          jcr->file_bsock->set_terminated();
-         if (jcr->my_thread_id && !pthread_equal(jcr->my_thread_id, pthread_self())) {
-            pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
-            Dmsg1(800, "Send kill to jid=%d\n", jcr->JobId);
-         }
+         jcr->my_thread_send_signal(TIMEOUT_SIGNAL);
       }
 
       /* Cancel Storage daemon */
@@ -450,13 +459,8 @@ bool cancel_job(UAContext *ua, JCR *jcr)
          ua->jcr->store_bsock = NULL;
          jcr->store_bsock->set_timed_out();
          jcr->store_bsock->set_terminated();
-         if (jcr->SD_msg_chan && !pthread_equal(jcr->SD_msg_chan, pthread_self())) {
-            Dmsg2(400, "kill jobid=%d use=%d\n", (int)jcr->JobId, jcr->use_count());
-            pthread_kill(jcr->SD_msg_chan, TIMEOUT_SIGNAL);
-         }
-         if (jcr->my_thread_id && !pthread_equal(jcr->my_thread_id, pthread_self())) {
-            pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
-         }
+         sd_msg_thread_send_signal(jcr, TIMEOUT_SIGNAL);
+         jcr->my_thread_send_signal(TIMEOUT_SIGNAL);
       }
       break;
    }
@@ -506,13 +510,8 @@ void cancel_storage_daemon_job(JCR *jcr)
       jcr->sd_canceled = true;
       jcr->store_bsock->set_timed_out();
       jcr->store_bsock->set_terminated();
-      if (jcr->SD_msg_chan && !pthread_equal(jcr->SD_msg_chan, pthread_self())) {
-         Dmsg2(400, "kill jobid=%d use=%d\n", (int)jcr->JobId, jcr->use_count());
-         pthread_kill(jcr->SD_msg_chan, TIMEOUT_SIGNAL);
-      }
-      if (jcr->my_thread_id && !pthread_equal(jcr->my_thread_id, pthread_self())) {
-         pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
-      }
+      sd_msg_thread_send_signal(jcr, TIMEOUT_SIGNAL);
+      jcr->my_thread_send_signal(TIMEOUT_SIGNAL);
    }
 bail_out:
    free_jcr(control_jcr);
index 68d67dc9cf66c229e6bd54d15ed9233fcdd9fdf2..c324d45111b880d5f7796fcc55553d97c39c0aa2 100644 (file)
@@ -450,7 +450,7 @@ void *jobq_server(void *arg)
          jq->engine(je->jcr);
 
          /* Job finished detach from thread */
-         set_jcr_in_tsd(INVALID_JCR);
+         remove_jcr_from_tsd(je->jcr);
 
          Dmsg2(2300, "Back from user engine jobid=%d use=%d.\n", jcr->JobId,
             jcr->use_count());
index 4b653d3790c77c37378783f023f018a2fcb7cc89..f354fb3b457eeed83dd53ff03a3027e12677392c 100644 (file)
@@ -354,8 +354,10 @@ extern "C" void msg_thread_cleanup(void *arg)
 {
    JCR *jcr = (JCR *)arg;
    db_end_transaction(jcr, jcr->db);       /* terminate any open transaction */
+   jcr->lock();
    jcr->sd_msg_thread_done = true;
    jcr->SD_msg_chan = 0;
+   jcr->unlock();
    pthread_cond_broadcast(&jcr->term_wait); /* wakeup any waiting threads */
    Dmsg2(100, "=== End msg_thread. JobId=%d usecnt=%d\n", jcr->JobId, jcr->use_count());
    free_jcr(jcr);                     /* release jcr */
@@ -377,7 +379,7 @@ extern "C" void *msg_thread(void *arg)
    uint64_t JobBytes;
 
    pthread_detach(pthread_self());
-   set_jcr_in_tsd(jcr);
+   set_jcr_in_tsd(jcr, false /* no thread update in jcr */);
    jcr->SD_msg_chan = pthread_self();
    pthread_cleanup_push(msg_thread_cleanup, arg);
    sd = jcr->store_bsock;
@@ -427,8 +429,7 @@ void wait_for_storage_daemon_termination(JCR *jcr)
          if (jcr->SD_msg_chan) {
             jcr->store_bsock->set_timed_out();
             jcr->store_bsock->set_terminated();
-            Dmsg2(400, "kill jobid=%d use=%d\n", (int)jcr->JobId, jcr->use_count());
-            pthread_kill(jcr->SD_msg_chan, TIMEOUT_SIGNAL);
+            sd_msg_thread_send_signal(jcr, TIMEOUT_SIGNAL);
          }
          cancel_count++;
       }
index ee7bef42deffc969d7e6d3cb9a4a01ff62ce0b0d..25882479ac10f1918055caafe4adce23f27cd5af 100644 (file)
@@ -141,6 +141,7 @@ extern void dird_free_jcr(JCR *jcr);
 extern void dird_free_jcr_pointers(JCR *jcr);
 extern void cancel_storage_daemon_job(JCR *jcr);
 extern bool run_console_command(JCR *jcr, const char *cmd);
+extern void sd_msg_thread_send_signal(JCR *jcr, int sig);
 
 /* migration.c */
 extern bool do_migration(JCR *jcr);
index 2a9dca806fd6a30f0a3f6e6b45cea4b883f2c6f2..b130670e7815e83f063fd429ea6d24f7b2c0d7de 100644 (file)
@@ -428,7 +428,7 @@ static int cancel_cmd(JCR *jcr)
          if (cjcr->store_bsock) {
             cjcr->store_bsock->set_timed_out();
             cjcr->store_bsock->set_terminated();
-            pthread_kill(cjcr->my_thread_id, TIMEOUT_SIGNAL);
+            cjcr->my_thread_send_signal(TIMEOUT_SIGNAL);
          }
          generate_plugin_event(cjcr, bEventCancelCommand, NULL);
          set_jcr_job_status(cjcr, JS_Canceled);
index 9db901ad6767c3826c1a8e55e517dd6f74029076..2d38956798584304b67e5a975f526a9603f31006 100644 (file)
@@ -202,10 +202,11 @@ public:
    const char *get_ActionName(bool past); /* in lib/jcr.c */
    void setJobStatus(int JobStatus);      /* in lib/jcr.c */
    bool JobReads();                       /* in lib/jcr.c */
-   
+   void my_thread_send_signal(int sig);   /* in lib/jcr.c */
 
    /* Global part of JCR common to all daemons */
    dlink link;                        /* JCR chain link */
+   bool my_thread_running;            /* is the thread controlling jcr running*/
    pthread_t my_thread_id;            /* id of thread controlling jcr */
    BSOCK *dir_bsock;                  /* Director bsock or NULL if we are him */
    BSOCK *store_bsock;                /* Storage connection socket */
index 988566ecfd3fc64d8d33a2a9a750d5ab720be20e..441f5a9f98be9f9101d08c2a4ea1ebbaf6c1b817 100644 (file)
@@ -342,7 +342,6 @@ JCR *new_jcr(int size, JCR_free_HANDLER *daemon_free_jcr)
    }
    jcr = (JCR *)malloc(size);
    memset(jcr, 0, size);
-   jcr->my_thread_id = pthread_self();
    jcr->msg_queue = New(dlist(item, &item->link));
    if ((status = pthread_mutex_init(&jcr->msg_queue_mutex, NULL)) != 0) {
       berrno be;
@@ -412,6 +411,9 @@ static void remove_jcr(JCR *jcr)
  */
 static void free_common_jcr(JCR *jcr)
 {
+   /* Uses jcr lock/unlock */
+   remove_jcr_from_tsd(jcr);
+
    jcr->destroy_mutex();
 
    if (jcr->msg_queue) {
@@ -475,7 +477,6 @@ static void free_common_jcr(JCR *jcr)
       free_pool_memory(jcr->comment);
       jcr->comment = NULL;
    }
-   remove_jcr_from_tsd(jcr);
    free(jcr);
 }
 
@@ -587,20 +588,54 @@ void remove_jcr_from_tsd(JCR *jcr)
 {
    JCR *tjcr = get_jcr_from_tsd();
    if (tjcr == jcr) { 
+      jcr->lock();
+      jcr->my_thread_running = false;
+      memset(&jcr->my_thread_id, 0, sizeof(jcr->my_thread_id));
+      jcr->unlock();
       set_jcr_in_tsd(INVALID_JCR);
    }
 }
 
 /*
- * Put this jcr in the thread specifc data 
+ * Put this jcr in the thread specific data
+ *  if update_thread_info is true and the jcr is valid,
+ *  we update my_thread_id and my_thread_running in the JCR
  */
-void set_jcr_in_tsd(JCR *jcr)
+void set_jcr_in_tsd(JCR *jcr, bool update_thread_info)
 {
    int status = pthread_setspecific(jcr_key, (void *)jcr);
    if (status != 0) {
       berrno be;
-      Jmsg1(jcr, M_ABORT, 0, _("pthread_setspecific failed: ERR=%s\n"), be.bstrerror(status));
+      Jmsg1(jcr, M_ABORT, 0, _("pthread_setspecific failed: ERR=%s\n"), 
+            be.bstrerror(status));
+   }
+
+   /* The caller explicitly asked to set a jcr in tsd, so we can update
+    * jcr->my_thread_id and jcr->my_thread_running */
+   if (update_thread_info && jcr && jcr != INVALID_JCR) {
+      Dmsg2(100, "setting my_thread_stuffs 0x%p => 0x%p\n", 
+            jcr->my_thread_id, pthread_self());
+      jcr->lock();
+      //ASSERT(jcr->my_thread_running == false);
+      jcr->my_thread_id = pthread_self();
+      jcr->my_thread_running = true;
+      jcr->unlock();
+   }
+}
+
+void JCR::my_thread_send_signal(int sig)
+{
+   this->lock();
+   if (   this->my_thread_running 
+       && !pthread_equal(this->my_thread_id, pthread_self()))
+   {
+      Dmsg1(800, "Send kill to jid=%d\n", this->JobId);
+      pthread_kill(this->my_thread_id, sig);
+
+   } else if (!this->my_thread_running) {
+      Dmsg1(0, "Warning, can't send kill to jid=%d\n", this->JobId);
    }
+   this->unlock();
 }
 
 /*
index 48196a0bf46eebed24b464d0f7a33664540a1dc9..e24291cce422ca31dda18a9a07d304d7c4cc077d 100644 (file)
@@ -206,7 +206,7 @@ JCR     *jcr_walk_next(JCR *prev_jcr);
 void     jcr_walk_end(JCR *jcr);
 int      job_count();
 JCR     *get_jcr_from_tsd();
-void     set_jcr_in_tsd(JCR *jcr);
+void     set_jcr_in_tsd(JCR *jcr, bool update_thread_info=true);
 void     remove_jcr_from_tsd(JCR *jcr);
 uint32_t get_jobid_from_tsd();             
 uint32_t get_jobid_from_tid(pthread_t tid);
index 71f6f9d2a7ea59e761b07b038aaf3394dc07049d..8f19c7834fbd19d698e4d34bf4efd9ac7b368365 100644 (file)
@@ -616,8 +616,8 @@ void terminate_stored(int sig)
          fd = jcr->file_bsock;
          if (fd) {
             fd->set_timed_out();
+            jcr->my_thread_send_signal(TIMEOUT_SIGNAL);
             Dmsg1(100, "term_stored killing JobId=%d\n", jcr->JobId);
-            pthread_kill(jcr->my_thread_id, TIMEOUT_SIGNAL);
             /* ***FIXME*** wiffle through all dcrs */
             if (jcr->dcr && jcr->dcr->dev && jcr->dcr->dev->blocked()) {
                pthread_cond_broadcast(&jcr->dcr->dev->wait_next_vol);
@@ -662,7 +662,7 @@ void terminate_stored(int sig)
       config->free_resources();
       free(config);
       config = NULL;
-  }
+   }
 
    if (debug_level > 10) {
       print_memory_pool_stats();