3 * Bacula Director -- msgchan.c -- handles the message channel
4 * to the Storage daemon and the File daemon.
6 * Kern Sibbald, August MM
8 * This routine runs as a thread and must be thread reentrant.
10 * Basic tasks done here:
11 * Open a message channel with the Storage daemon
12 * to authenticate ourselves and to pass the JobId.
13 * Create a thread to interact with the Storage daemon
14 * who returns a job status and requests Catalog services, etc.
19 Copyright (C) 2000-2005 Kern Sibbald
21 This program is free software; you can redistribute it and/or
22 modify it under the terms of the GNU General Public License as
23 published by the Free Software Foundation; either version 2 of
24 the License, or (at your option) any later version.
26 This program is distributed in the hope that it will be useful,
27 but WITHOUT ANY WARRANTY; without even the implied warranty of
28 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
29 General Public License for more details.
31 You should have received a copy of the GNU General Public
32 License along with this program; if not, write to the Free
33 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
// Format strings for the commands this file sends to the Storage daemon (SD)
// through bnet_fsend().
41 /* Commands sent to Storage daemon */
// jobcmd: job description sent by start_storage_daemon_job(). The arguments
// supplied there are, in order: JobId, Job, job resource name, client
// resource name, JobType, JobLevel, fileset resource name, the negation of
// the pool's catalog_files flag, SpoolAttributes, the fileset MD5, spool_data
// and write_part_after_job.
42 static char jobcmd[] = "JobId=%d job=%s job_name=%s client_name=%s "
43 "type=%d level=%d FileSet=%s NoAttr=%d SpoolAttr=%d FileSetMD5=%s "
44 "SpoolData=%d WritePartAfterJob=%d";
// use_device: sent for each candidate device inside the device loop of
// start_storage_daemon_job(). PoolId travels as a string (%s); that function
// calls edit_int64(jcr->PoolId, PoolId) beforehand, presumably to produce it.
45 static char use_device[] = "use device=%s media_type=%s pool_name=%s "
46 "pool_type=%s PoolId=%s append=%d\n";
// query_device: sent by update_device_res() with the device name after it has
// been passed through bash_spaces().
47 static char query_device[] = "query device=%s";
// sscanf() patterns for the replies the Storage daemon sends back to the
// commands issued from this file.
49 /* Response from Storage daemon */
// OKjob: reply to jobcmd; parsed in start_storage_daemon_job() into
// jcr->VolSessionId, jcr->VolSessionTime and auth_key.
// NOTE(review): %100s stores up to 100 characters plus a terminating NUL, so
// the destination must hold at least 101 bytes. The declaration of auth_key is
// not visible in this excerpt; confirm its size.
50 static char OKjob[] = "3000 OK Job SDid=%d SDtime=%d Authorization=%100s\n";
// OK_device: reply to use_device; parsed in start_storage_daemon_job().
// NOTE(review): %s has no field width, so the length written is bounded only
// by the length of the received message.
51 static char OK_device[] = "3000 OK use device device=%s\n";
// OK_query: reply to query_device; parsed in update_device_res().
// NOTE(review): update_device_res() passes 14 targets and requires a result of
// 14, but only 13 conversions (10 integers, 3 strings) are visible here. A
// conversion matching the &dev->PoolId argument appears to be missing from
// this excerpt; verify against the complete file.
52 static char OK_query[] = "3001 OK query "
53 "append=%d read=%d num_writers=%d "
54 "open=%d labeled=%d offline=%d "
55 "reserved=%d max_writers=%d "
56 "autoselect=%d autochanger=%d "
58 "changer_name=%127s media_type=%127s volume_name=%127s";
// sscanf() patterns for the unsolicited messages that msg_thread() recognises
// on the Storage daemon socket.
60 /* Storage Daemon requests */
// Job_start: one string conversion; msg_thread() scans it into its local Job
// buffer, declared as char Job[MAX_NAME_LENGTH].
// NOTE(review): %127s needs a 128-byte destination; confirm that
// MAX_NAME_LENGTH is at least 128.
61 static char Job_start[] = "3010 Job %127s start\n";
// Job_end: job name, JobStatus, JobFiles and JobBytes. The JobBytes conversion
// is assembled from "%" and lld; lld is not defined in this excerpt and is
// presumably a macro expanding to a string literal.
62 static char Job_end[] =
63 "3099 Job %127s end JobStatus=%d JobFiles=%d JobBytes=%" lld "\n";
65 /* Forward referenced functions */
// Declared ahead of its definition because
// start_storage_daemon_message_thread() hands it to pthread_create().
66 extern "C" void *msg_thread(void *arg);
69 * Establish a message channel connection with the Storage daemon
70 * and perform authentication.
// Open the Director's message channel to the Storage daemon for a job and
// authenticate on it.
//
// jcr            - job control record; jcr->store_bsock receives the socket.
// retry_interval - forwarded unchanged to bnet_connect().
// max_retry_time - forwarded unchanged to bnet_connect().
// verbose        - forwarded unchanged to bnet_connect().
//
// Returns true immediately when jcr->store_bsock is already set. The other
// return statements are not visible in this excerpt.
// NOTE(review): the lines between the bnet_connect() call and the write to
// sd->res are missing from this excerpt, so the check for a failed connection
// cannot be confirmed here.
72 bool connect_to_storage_daemon(JCR *jcr, int retry_interval,
73 int max_retry_time, int verbose)
78 if (jcr->store_bsock) {
79 return true; /* already connected */
// Only the first entry of the job's storage list is used.
81 store = (STORE *)jcr->storage->first();
84 * Open message channel with the Storage daemon
86 Dmsg2(100, "bnet_connect to Storage daemon %s:%d\n", store->address,
88 sd = bnet_connect(jcr, retry_interval, max_retry_time,
89 _("Storage daemon"), store->address,
90 NULL, store->SDport, verbose);
94 sd->res = (RES *)store; /* save pointer to other end */
95 jcr->store_bsock = sd;
// If authentication fails the socket pointer is cleared again, so a later
// call does not take the already-connected shortcut above.
97 if (!authenticate_storage_daemon(jcr, store)) {
99 jcr->store_bsock = NULL;
106 * Here we ask the SD to send us the info for a
107 * particular device resource.
// Ask the Storage daemon for the state of one device and copy the answer into
// the Director's DEVICE resource.
//
// jcr - job control record; its store_bsock carries the query. The connection
//       is opened on demand through connect_to_storage_daemon() with
//       retry_interval 5, max_retry_time 30 and verbose 0.
// dev - device resource to refresh. Its hdr.name, after bash_spaces(), is the
//       device named in the query.
//
// The return statements are not visible in this excerpt; device_thread()
// treats a false result as a failed update.
//
// The reply is parsed with OK_query and must yield exactly 14 conversions.
// num_writers, reserved, max_writers and PoolId are scanned straight into dev.
// The flag values are scanned into int temporaries and assigned afterwards.
// The three strings are scanned into POOL_MEM buffers, passed through
// unbash_spaces(), and copied with bstrncpy() bounded by the size of each
// destination field.
// NOTE(review): each %127s conversion can write 128 bytes through c_str();
// confirm that a default-constructed POOL_MEM provides at least that much.
109 bool update_device_res(JCR *jcr, DEVICE *dev)
111 POOL_MEM device_name, changer_name, media_type, volume_name;
112 int dev_open, dev_append, dev_read, dev_labeled;
113 int dev_offline, dev_autochanger, dev_autoselect;
115 if (!connect_to_storage_daemon(jcr, 5, 30, 0)) {
118 sd = jcr->store_bsock;
// Work on a copy of the name so the resource's own hdr.name is not modified.
119 pm_strcpy(device_name, dev->hdr.name);
120 bash_spaces(device_name);
121 bnet_fsend(sd, query_device, device_name.c_str());
122 Dmsg1(100, ">stored: %s\n", sd->msg);
123 if (bget_dirmsg(sd) > 0) {
124 Dmsg1(100, "<stored: %s", sd->msg);
// A reply that does not convert all 14 fields takes the branch opened by this
// test; the body of that branch is not visible in this excerpt.
125 if (sscanf(sd->msg, OK_query,
126 &dev_append, &dev_read,
127 &dev->num_writers, &dev_open,
128 &dev_labeled, &dev_offline, &dev->reserved,
129 &dev->max_writers, &dev_autoselect,
130 &dev_autochanger, &dev->PoolId,
131 changer_name.c_str(), media_type.c_str(),
132 volume_name.c_str()) != 14) {
135 unbash_spaces(changer_name);
136 unbash_spaces(media_type);
137 unbash_spaces(volume_name);
138 bstrncpy(dev->ChangerName, changer_name.c_str(), sizeof(dev->ChangerName));
139 bstrncpy(dev->MediaType, media_type.c_str(), sizeof(dev->MediaType));
140 bstrncpy(dev->VolumeName, volume_name.c_str(), sizeof(dev->VolumeName));
141 /* Note, these are copied because they are boolean rather than
144 dev->open = dev_open;
145 dev->append = dev_append;
146 dev->read = dev_read;
147 dev->labeled = dev_labeled;
148 dev->offline = dev_offline;
149 dev->autoselect = dev_autoselect;
150 dev->autochanger = dev_autochanger;
159 * Start a job with the Storage daemon
// Describe the job to the Storage daemon, obtain the authorization key, pick
// a device and tell the daemon to run.
//
// jcr    - job control record; jcr->store_bsock must already hold the socket,
//          because this function only reads it.
// store  - list of storages; only its first entry is used, as the outer loop
//          is commented out below.
// append - not referenced in the visible lines; presumably supplies the
//          append=%d field of use_device, but the tail of that bnet_fsend()
//          call is missing from this excerpt.
//
// The return statements and several local declarations (sd, storage, dev, ok,
// auth_key, PoolId, err_msg) are not visible in this excerpt.
//
// Steps visible here:
//   1. Send jobcmd and parse the OKjob reply into jcr->VolSessionId,
//      jcr->VolSessionTime and auth_key; a copy of the key made with
//      bstrdup() is stored in jcr->sd_auth_key.
//   2. For each device of the first storage, send use_device and test the
//      reply against OK_device; a reply that is not accepted is reported with
//      an M_WARNING message.
//   3. Send "run".
// NOTE(review): &auth_key is passed for a %100s conversion. If auth_key is a
// char array this has the wrong pointer type for %s, and the array must hold
// 101 bytes; confirm against its declaration.
161 int start_storage_daemon_job(JCR *jcr, alist *store, int append)
167 POOL_MEM device_name, pool_name, pool_type, media_type;
170 sd = jcr->store_bsock;
172 * Now send JobId and permissions, and get back the authorization key.
// The resource names are modified in place by bash_spaces() for the duration
// of the send and restored with unbash_spaces() right after it.
174 bash_spaces(jcr->job->hdr.name);
175 bash_spaces(jcr->client->hdr.name);
176 bash_spaces(jcr->fileset->hdr.name);
// An empty fileset MD5 is replaced by a placeholder before it is sent.
177 if (jcr->fileset->MD5[0] == 0) {
178 bstrncpy(jcr->fileset->MD5, "**Dummy**", sizeof(jcr->fileset->MD5));
180 bnet_fsend(sd, jobcmd, jcr->JobId, jcr->Job, jcr->job->hdr.name,
181 jcr->client->hdr.name, jcr->JobType, jcr->JobLevel,
182 jcr->fileset->hdr.name, !jcr->pool->catalog_files,
183 jcr->job->SpoolAttributes, jcr->fileset->MD5, jcr->spool_data, jcr->write_part_after_job);
184 Dmsg1(100, ">stored: %s\n", sd->msg);
185 unbash_spaces(jcr->job->hdr.name);
186 unbash_spaces(jcr->client->hdr.name);
187 unbash_spaces(jcr->fileset->hdr.name);
188 if (bget_dirmsg(sd) > 0) {
189 Dmsg1(100, "<stored: %s", sd->msg);
// All three OKjob fields must convert; otherwise the job is failed with an
// M_FATAL message that quotes the daemon's reply.
190 if (sscanf(sd->msg, OKjob, &jcr->VolSessionId,
191 &jcr->VolSessionTime, &auth_key) != 3) {
192 Dmsg1(100, "BadJob=%s\n", sd->msg);
193 Jmsg(jcr, M_FATAL, 0, _("Storage daemon rejected Job command: %s\n"), sd->msg);
196 jcr->sd_auth_key = bstrdup(auth_key);
197 Dmsg1(150, "sd_auth_key=%s\n", jcr->sd_auth_key);
200 Jmsg(jcr, M_FATAL, 0, _("<stored: bad response to Job command: %s\n"),
// Pool type and name are copied into POOL_MEM buffers before bash_spaces(),
// so the pool resource itself is left untouched.
205 pm_strcpy(pool_type, jcr->pool->pool_type);
206 pm_strcpy(pool_name, jcr->pool->hdr.name);
207 bash_spaces(pool_type);
208 bash_spaces(pool_name);
209 edit_int64(jcr->PoolId, PoolId);
212 * We have two loops here. The first comes from the
213 * Storage = associated with the Job, and we need
214 * to attach to each one.
215 * The inner loop loops over all the alternative devices
216 * associated with each Storage. It selects the first
219 * Note, the outer loop is not yet implemented.
221 // foreach_alist(storage, store) {
222 storage = (STORE *)store->first();
224 /* Loop over alternative storages until one is OK */
225 foreach_alist(dev, storage->device) {
226 pm_strcpy(device_name, dev->hdr.name);
227 pm_strcpy(media_type, storage->media_type);
228 bash_spaces(device_name);
229 bash_spaces(media_type);
230 bnet_fsend(sd, use_device, device_name.c_str(),
231 media_type.c_str(), pool_name.c_str(), pool_type.c_str(),
233 Dmsg1(100, ">stored: %s", sd->msg);
234 if (bget_dirmsg(sd) > 0) {
235 Dmsg1(100, "<stored: %s", sd->msg);
// NOTE(review): OK_device uses an unbounded %s, so the scan below can write
// as many bytes into device_name as the received message contains.
236 /* ****FIXME**** save actual device name */
237 ok = sscanf(sd->msg, OK_device, device_name.c_str()) == 1;
// The reply text is copied into err_msg before being reported.
243 pm_strcpy(err_msg, sd->msg); /* save message */
244 Jmsg(jcr, M_WARNING, 0, _("\n"
245 " Storage daemon didn't accept Device \"%s\" because:\n %s"),
246 device_name.c_str(), err_msg.c_str()/* sd->msg */);
// The result of sending "run" becomes the value of ok.
254 ok = bnet_fsend(sd, "run");
255 Dmsg1(100, ">stored: %s\n", sd->msg);
261 * Start a thread to handle Storage daemon messages and Catalog requests.
// Create the thread that runs msg_thread() for this job and wait until that
// thread has announced itself.
//
// jcr - job control record; passed to the new thread as its argument.
//
// The return statements are not visible in this excerpt.
//
// jcr->use_count is incremented before the thread is created;
// msg_thread_cleanup() calls free_jcr() when the thread ends, presumably to
// drop that reference.
// NOTE(review): no locking around the use_count increment is visible here;
// confirm whether the caller already holds a lock.
264 int start_storage_daemon_message_thread(JCR *jcr)
270 jcr->use_count++; /* mark in use by msg thread */
271 jcr->sd_msg_thread_done = false;
272 jcr->SD_msg_chan = 0;
274 Dmsg0(100, "Start SD msg_thread.\n");
// A failure to create the thread is reported with an M_ABORT message.
275 if ((status=pthread_create(&thid, NULL, msg_thread, (void *)jcr)) != 0) {
277 Jmsg1(jcr, M_ABORT, 0, _("Cannot create message thread: %s\n"), be.strerror(status));
279 Dmsg0(100, "SD msg_thread started.\n");
280 /* Wait for thread to start */
// msg_thread() stores its own thread id in jcr->SD_msg_chan, which ends this
// loop. The loop body is not visible in this excerpt.
281 while (jcr->SD_msg_chan == 0) {
// Cleanup handler for the message thread; msg_thread() registers it with
// pthread_cleanup_push() and passes the JCR pointer as arg.
//
// arg - the job's JCR, cast back from void *.
//
// It ends any open database transaction, marks the thread as done, wakes the
// threads waiting on jcr->term_wait, clears the published thread id and calls
// free_jcr().
// NOTE(review): wait_for_storage_daemon_termination() waits on term_wait
// together with jcr->mutex. Whether that mutex is held while
// sd_msg_thread_done is set here is not visible in this excerpt.
287 extern "C" void msg_thread_cleanup(void *arg)
289 JCR *jcr = (JCR *)arg;
290 Dmsg0(200, "End msg_thread\n");
291 db_end_transaction(jcr, jcr->db); /* terminate any open transaction */
293 jcr->sd_msg_thread_done = true;
294 pthread_cond_broadcast(&jcr->term_wait); /* wakeup any waiting threads */
295 jcr->SD_msg_chan = 0;
297 free_jcr(jcr); /* release jcr */
301 * Handle the message channel (i.e. requests from the Storage daemon).
303 * Note, we are running in a separate thread.
// Thread body that reads messages from the Storage daemon socket of one job.
//
// arg - the job's JCR, as passed to pthread_create() by
//       start_storage_daemon_message_thread().
//
// The thread detaches itself, publishes its thread id in jcr->SD_msg_chan and
// registers msg_thread_cleanup(). It then reads with bget_dirmsg() for as long
// as that call returns a non-negative value. A message matching Job_end
// supplies the values stored in jcr->SDJobStatus, jcr->SDJobFiles and
// jcr->SDJobBytes. The return statement is not visible in this excerpt.
// NOTE(review): both sscanf() calls pass &Job, a pointer to the whole array,
// for a %s conversion. It has the same address as Job but the wrong type for
// %s; passing Job would be the conforming form.
305 extern "C" void *msg_thread(void *arg)
307 JCR *jcr = (JCR *)arg;
310 char Job[MAX_NAME_LENGTH];
315 pthread_detach(pthread_self());
// Publishing the thread id releases the creator, which loops until
// SD_msg_chan becomes non-zero.
316 jcr->SD_msg_chan = pthread_self();
317 pthread_cleanup_push(msg_thread_cleanup, arg);
318 sd = jcr->store_bsock;
320 /* Read the Storage daemon's output.
322 Dmsg0(100, "Start msg_thread loop\n");
323 while ((stat=bget_dirmsg(sd)) >= 0) {
324 Dmsg1(200, "<stored: %s", sd->msg);
325 if (sscanf(sd->msg, Job_start, &Job) == 1) {
328 if (sscanf(sd->msg, Job_end, &Job, &JobStatus, &JobFiles,
330 jcr->SDJobStatus = JobStatus; /* termination status */
331 jcr->SDJobFiles = JobFiles;
332 jcr->SDJobBytes = JobBytes;
// A socket error overrides whatever status was recorded.
336 if (is_bnet_error(sd)) {
337 jcr->SDJobStatus = JS_ErrorTerminated;
// The non-zero argument makes pthread_cleanup_pop() run msg_thread_cleanup()
// on this normal exit path as well.
339 pthread_cleanup_pop(1);
// Block the caller until the message thread of this job has finished.
//
// jcr - job control record; its sd_msg_thread_done flag is polled and its
//       term_wait condition variable is waited on with jcr->mutex.
//
// The job status is set to JS_WaitSD on entry, and a call setting it to
// JS_Terminated is the last visible statement. Each pass waits on term_wait
// with a deadline 10 seconds after the current time. When the job has been
// canceled, cancel_count takes part in a limit of 3 passes; the lines that
// update it are not visible in this excerpt.
// NOTE(review): only timeout.tv_sec is assigned in the visible lines; confirm
// that tv_nsec is set in the missing ones, since pthread_cond_timedwait()
// reads both fields.
// NOTE(review): the lock and unlock of jcr->mutex are not visible here;
// pthread_cond_timedwait() requires the mutex to be locked by the caller.
// NOTE(review): the visible call discards the result of
// pthread_cond_timedwait(); the loop relies on re-testing sd_msg_thread_done.
343 void wait_for_storage_daemon_termination(JCR *jcr)
345 int cancel_count = 0;
346 /* Now wait for Storage daemon to terminate our message thread */
347 set_jcr_job_status(jcr, JS_WaitSD);
349 while (!jcr->sd_msg_thread_done) {
352 struct timespec timeout;
// Build an absolute deadline from the current time of day.
354 gettimeofday(&tv, &tz);
356 timeout.tv_sec = tv.tv_sec + 10; /* wait 10 seconds */
357 Dmsg0(300, "I'm waiting for message thread termination.\n");
358 pthread_cond_timedwait(&jcr->term_wait, &jcr->mutex, &timeout);
359 if (job_canceled(jcr)) {
362 /* Give SD 30 seconds to clean up after cancel */
363 if (cancel_count == 3) {
368 set_jcr_job_status(jcr, JS_Terminated);
// Thread body started by init_device_resources() to refresh every Device
// resource from the Storage daemon.
//
// arg - not referenced in the visible lines; init_device_resources() passes
//       NULL.
//
// The thread detaches itself and creates a control JCR named "*DeviceInit*"
// of type JT_SYSTEM. It then makes up to MAX_TRIES attempts to connect, each
// with retry_interval 10, max_retry_time 30 and verbose 1. update_device_res()
// is called for every R_DEVICE resource, after which the socket is closed and
// its pointer cleared. The statements between these steps, the release of the
// JCR and the return statement are not visible in this excerpt.
374 extern "C" void *device_thread(void *arg)
381 pthread_detach(pthread_self());
382 jcr = new_control_jcr("*DeviceInit*", JT_SYSTEM);
383 for (i=0; i < MAX_TRIES; i++) {
384 if (!connect_to_storage_daemon(jcr, 10, 30, 1)) {
385 Dmsg0(000, "Failed connecting to SD.\n");
// A failed update of one device is only reported through a debug message.
389 foreach_res(dev, R_DEVICE) {
390 if (!update_device_res(jcr, dev)) {
391 Dmsg1(900, "Error updating device=%s\n", dev->hdr.name);
393 Dmsg1(900, "Updated Device=%s\n", dev->hdr.name);
// Clearing the pointer after the close keeps connect_to_storage_daemon() from
// treating the closed socket as an existing connection.
397 bnet_close(jcr->store_bsock);
398 jcr->store_bsock = NULL;
407 * Start a thread to handle getting Device resource information
408 * from SD. This is called once at startup of the Director.
// Start device_thread() in a new thread, with NULL as its argument.
//
// Takes no parameters and returns nothing. A pthread_create() failure is
// reported with an M_ABORT message that has no JCR attached.
// NOTE(review): the error text says "message thread" although the thread
// being created runs device_thread(). The wording matches the message in
// start_storage_daemon_message_thread() and is probably a leftover copy.
410 void init_device_resources()
415 Dmsg0(100, "Start Device thread.\n");
416 if ((status=pthread_create(&thid, NULL, device_thread, NULL)) != 0) {
418 Jmsg1(NULL, M_ABORT, 0, _("Cannot create message thread: %s\n"), be.strerror(status));