2 Bacula® - The Network Backup Solution
4 Copyright (C) 2000-2011 Free Software Foundation Europe e.V.
6 The main author of Bacula is Kern Sibbald, with contributions from
7 many others, a complete list can be found in the file AUTHORS.
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version three of the GNU Affero General Public
10 License as published by the Free Software Foundation and included
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU Affero General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Bacula® is a registered trademark of Kern Sibbald.
24 The licensor of Bacula is the Free Software Foundation Europe
25 (FSFE), Fiduciary Program, Sumatrastrasse 25, 8006 Zürich,
26 Switzerland, email:ftf@fsfeurope.org.
29 * Job control and execution for Storage Daemon
/* Mutex passed to pthread_cond_timedwait() in run_cmd() while a job waits
 * on its job_start_wait condition. */
38 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
40 /* Imported variables */
41 extern uint32_t VolSessionTime;
43 /* Imported functions */
44 extern uint32_t newVolSessionId();
45 extern bool do_mac(JCR *jcr);
47 /* Requests from the Director daemon */
48 /* Added in 3.1.4 14Sep09 KES */
/*
 * sscanf() template that job_cmd() applies to the message received from
 * the Director. It holds 17 conversions; the string fields are limited to
 * 127 characters, with the exception of SpoolSize.
 * NOTE(review): SpoolSize=%s carries no field width, and job_cmd() scans it
 * into spool_size, which is used there with sizeof() as a fixed-size
 * buffer. The declaration is not visible in this listing -- confirm the
 * buffer size and consider bounding the conversion.
 */
49 static char jobcmd[] = "JobId=%d job=%127s job_name=%127s client_name=%127s "
50 "type=%d level=%d FileSet=%127s NoAttr=%d SpoolAttr=%d FileSetMD5=%127s "
51 "SpoolData=%d WritePartAfterJob=%d PreferMountedVols=%d SpoolSize=%s "
52 "rerunning=%d VolSessionId=%d VolSessionTime=%d\n";
54 /* Responses sent to Director daemon */
/* Success reply of job_cmd(): VolSessionId, VolSessionTime and the
 * authorization key generated for the job. */
55 static char OKjob[] = "3000 OK Job SDid=%u SDtime=%u Authorization=%s\n";
/* Error reply of job_cmd(): the sscanf() result followed by a copy of the
 * message that was received. */
56 static char BAD_job[] = "3915 Bad Job command. stat=%d CMD: %s\n";
59 * Director requests us to start a job
60 * Basic tasks done here:
61 * - We pickup the JobId to be run from the Director.
62 * - We pickup the device, media, and pool from the Director
63 * - Wait for a connection from the File Daemon (FD)
64 * - Accept commands from the FD (i.e. run the job)
65 * - Return when the connection is terminated or
/*
 * job_cmd: handle the job request that the Director sent on jcr->dir_bsock.
 *
 * Visible steps: scan dir->msg with the jobcmd template, copy the scanned
 * names and flags into the JCR, reply with OKjob carrying a newly generated
 * authorization key, then instantiate the plugins and raise the JobStart
 * events.
 *
 * NOTE(review): this listing does not show every line of the function
 * (braces, some local declarations, the error-check condition and the
 * return statements are absent). The notes below describe only statements
 * that are visible.
 */
68 bool job_cmd(JCR *jcr)
74 BSOCK *dir = jcr->dir_bsock;
75 POOL_MEM job_name, client_name, job, fileset_name, fileset_md5;
76 int32_t JobType, level, spool_attributes, no_attributes, spool_data;
77 int32_t write_part_after_job, PreferMountedVols, rerunning;
82 * Get JobId and permissions from Director
84 Dmsg1(100, "<dird: %s", dir->msg);
/* spool_size is preset to 0 before the message is scanned. */
85 bstrncpy(spool_size, "0", sizeof(spool_size));
/*
 * NOTE(review): jobcmd has 17 conversions but only 16 arguments are visible
 * below; the client_name argument is presumably on a line that is not
 * shown -- confirm against the full file.
 */
86 stat = sscanf(dir->msg, jobcmd, &JobId, job.c_str(), job_name.c_str(),
88 &JobType, &level, fileset_name.c_str(), &no_attributes,
89 &spool_attributes, fileset_md5.c_str(), &spool_data,
90 &write_part_after_job, &PreferMountedVols, spool_size,
91 &rerunning, &jcr->VolSessionId, &jcr->VolSessionTime);
/* Error reply: the received message is copied to jcr->errmsg and sent back
 * with the sscanf() result, and the job is marked JS_ErrorTerminated. The
 * condition guarding these lines is not visible here. */
93 pm_strcpy(jcr->errmsg, dir->msg);
94 dir->fsend(BAD_job, stat, jcr->errmsg);
95 Dmsg1(100, ">dird: %s", dir->msg);
96 jcr->setJobStatus(JS_ErrorTerminated);
99 jcr->rerunning = (rerunning) ? true : false;
100 Dmsg3(100, "==== rerunning=%d VolSesId=%d VolSesTime=%d\n", jcr->rerunning,
101 jcr->VolSessionId, jcr->VolSessionTime);
103 * Since this job could be rescheduled, we
104 * check to see if we have it already. If so
105 * free the old jcr and use the new one.
107 ojcr = get_jcr_by_full_name(job.c_str());
/* Only an existing JCR that is not yet authenticated is considered here;
 * what is done with it beyond the debug message is not visible. */
108 if (ojcr && !ojcr->authenticated) {
109 Dmsg2(100, "Found ojcr=0x%x Job %s\n", (unsigned)(intptr_t)ojcr, job.c_str());
113 Dmsg2(800, "Start JobId=%d %p\n", JobId, jcr);
115 * If job rescheduled because previous was incomplete,
116 * the Resched flag is set and VolSessionId and VolSessionTime
117 * are given to us (same as restarted job).
/* A job that is not being rerun gets a new session id and the imported
 * VolSessionTime; otherwise the values scanned from the message are kept. */
119 if (!jcr->rerunning) {
120 jcr->VolSessionId = newVolSessionId();
121 jcr->VolSessionTime = VolSessionTime;
123 bstrncpy(jcr->Job, job, sizeof(jcr->Job));
/* Each name goes through unbash_spaces() and is then copied into pool
 * memory obtained with get_pool_memory(PM_NAME). unbash_spaces() presumably
 * restores spaces that were escaped for transport -- verify against its
 * definition. */
124 unbash_spaces(job_name);
125 jcr->job_name = get_pool_memory(PM_NAME);
126 pm_strcpy(jcr->job_name, job_name);
127 unbash_spaces(client_name);
128 jcr->client_name = get_pool_memory(PM_NAME);
129 pm_strcpy(jcr->client_name, client_name);
130 unbash_spaces(fileset_name);
131 jcr->fileset_name = get_pool_memory(PM_NAME);
132 pm_strcpy(jcr->fileset_name, fileset_name);
133 jcr->setJobType(JobType);
134 jcr->setJobLevel(level);
135 jcr->no_attributes = no_attributes;
136 jcr->spool_attributes = spool_attributes;
137 jcr->spool_data = spool_data;
/* The spool size arrives as text and is converted with str_to_int64(). */
138 jcr->spool_size = str_to_int64(spool_size);
139 jcr->write_part_after_job = write_part_after_job;
140 jcr->fileset_md5 = get_pool_memory(PM_NAME);
141 pm_strcpy(jcr->fileset_md5, fileset_md5);
142 jcr->PreferMountedVols = PreferMountedVols;
/* The job starts out unauthenticated; handle_filed_connection() sets the
 * flag after authenticate_filed() succeeds. */
145 jcr->authenticated = false;
148 * Pass back an authorization key for the File daemon
/* The seed for the session key is the JCR address followed by the JobId. */
150 bsnprintf(seed, sizeof(seed), "%p%d", jcr, JobId);
151 make_session_key(auth_key, seed, 1);
152 dir->fsend(OKjob, jcr->VolSessionId, jcr->VolSessionTime, auth_key);
153 Dmsg2(50, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg);
/* The key is duplicated into the JCR and the working copy is then zeroed.
 * NOTE(review): if auth_key is a local buffer that is not read again, the
 * compiler may drop this memset -- confirm whether that matters here. */
154 jcr->sd_auth_key = bstrdup(auth_key);
155 memset(auth_key, 0, sizeof(auth_key));
156 new_plugins(jcr); /* instantiate the plugins */
157 generate_daemon_event(jcr, "JobStart");
158 generate_plugin_event(jcr, bsdEventJobStart, (void *)"JobStart");
/*
 * run_cmd: wait until the File daemon has authenticated against this job,
 * then run the job.
 *
 * The wait is a timed wait on jcr->job_start_wait with a deadline of
 * me->client_wait seconds from now; it also ends when the job is canceled.
 * run_job() is called only for an authenticated, non-canceled job.
 *
 * NOTE(review): several lines of this function are not visible in this
 * listing (braces, declarations, lock and unlock calls, returns). The notes
 * below cover only the statements that are shown.
 */
162 bool run_cmd(JCR *jcr)
166 struct timespec timeout;
170 Dmsg1(200, "Run_cmd: %s\n", jcr->dir_bsock->msg);
172 /* If we do not need the FD, we are doing a migrate, copy, or virtual
175 if (jcr->no_client_used()) {
180 jcr->sendJobStatus(JS_WaitFD); /* wait for FD to connect */
/* Absolute deadline for the timed wait: current time plus
 * me->client_wait seconds. */
182 gettimeofday(&tv, &tz);
183 timeout.tv_nsec = tv.tv_usec * 1000;
184 timeout.tv_sec = tv.tv_sec + me->client_wait;
186 Dmsg3(50, "%s waiting %d sec for FD to contact SD key=%s\n",
187 jcr->Job, (int)(timeout.tv_sec-time(NULL)), jcr->sd_auth_key);
188 Dmsg2(800, "Wait FD for jid=%d %p\n", jcr->JobId, jcr);
191 * Wait for the File daemon to contact us to start the Job,
192 * when he does, we will be released, unless the 30 minutes
/* Loop until the job is authenticated or canceled. The condition is
 * signaled by handle_filed_connection(). */
196 while ( !jcr->authenticated && !job_canceled(jcr) ) {
197 errstat = pthread_cond_timedwait(&jcr->job_start_wait, &mutex, &timeout);
/* ETIMEDOUT, EINVAL and EPERM are singled out; the action taken for them
 * is not visible in this listing. */
198 if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
201 Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
203 Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
204 job_canceled(jcr), errstat);
206 Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
/* Overwrite the stored authorization key with zeros, using its current
 * string length. */
208 memset(jcr->sd_auth_key, 0, strlen(jcr->sd_auth_key));
/* Run the job only when authentication succeeded and no cancel arrived. */
210 if (jcr->authenticated && !job_canceled(jcr)) {
211 Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
212 run_job(jcr); /* Run the job */
214 Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
219 * After receiving a connection (in dircmd.c) if it is
220 * from the File daemon, this routine is called.
/*
 * handle_filed_connection: attach a connecting File daemon socket to the
 * job named job_name and authenticate it.
 *
 *   fd       - socket of the connecting File daemon; stored in
 *              jcr->file_bsock
 *   job_name - full job name used to look up the JCR
 *
 * On successful authentication jcr->authenticated is set; otherwise the job
 * is marked JS_ErrorTerminated. The job_start_wait condition is signaled so
 * that the waiter in run_cmd() wakes up.
 *
 * NOTE(review): lines that follow the two early error branches are not
 * visible in this listing, so how those branches end cannot be stated here.
 */
222 void handle_filed_connection(BSOCK *fd, char *job_name)
227 * With the following bmicrosleep on, running the
228 * SD under the debugger fails.
230 // bmicrosleep(0, 50000); /* wait 50 millisecs */
/* Look the job up by its full name; an unknown name is reported as fatal. */
231 if (!(jcr=get_jcr_by_full_name(job_name))) {
232 Jmsg1(NULL, M_FATAL, 0, _("FD connect failed: Job name not found: %s\n"), job_name);
233 Dmsg1(3, "**** Job \"%s\" not found.\n", job_name);
239 Dmsg1(50, "Found Job %s\n", job_name);
/* A job that is already authenticated is reported as a fatal error. */
241 if (jcr->authenticated) {
242 Jmsg2(jcr, M_FATAL, 0, _("Hey!!!! JobId %u Job %s already authenticated.\n"),
243 (uint32_t)jcr->JobId, jcr->Job);
244 Dmsg2(50, "Hey!!!! JobId %u Job %s already authenticated.\n",
245 (uint32_t)jcr->JobId, jcr->Job);
/* Bind the socket to the job and the job to the socket. */
251 jcr->file_bsock = fd;
252 jcr->file_bsock->set_jcr(jcr);
255 * Authenticate the File daemon
/* NOTE(review): jcr->authenticated was already tested above. Whether the
 * first operand can still be true here depends on lines that are not
 * visible -- confirm against the full file. */
257 if (jcr->authenticated || !authenticate_filed(jcr)) {
258 Dmsg1(50, "Authentication failed Job %s\n", jcr->Job);
259 Jmsg(jcr, M_FATAL, 0, _("Unable to authenticate File daemon\n"));
261 jcr->authenticated = true;
262 Dmsg2(50, "OK Authentication jid=%u Job %s\n", (uint32_t)jcr->JobId, jcr->Job);
/* A job that is still unauthenticated at this point is terminated in
 * error. */
265 if (!jcr->authenticated) {
266 jcr->setJobStatus(JS_ErrorTerminated);
268 pthread_cond_signal(&jcr->job_start_wait); /* wake waiting job */
276 * Query Device command from Director
277 * Sends Storage Daemon's information on the device to the
278 * caller (presumably the Director).
279 * This command always returns "true" so that the line is
280 * not closed on an error.
/*
 * query_cmd: answer a Director query about a device or an autochanger.
 *
 * A device name is scanned out of dir->msg with the query_device template.
 * The R_DEVICE resources are searched for that name first, then the
 * R_AUTOCHANGER resources. A match is reported through dir_update_device()
 * or dir_update_changer(), followed by OK_query or NO_query. When nothing
 * matches, NO_device is sent.
 *
 * NOTE(review): query_device, OK_query, NO_query, NO_device and BAD_query
 * are not defined in the visible part of this file, and several lines of
 * the function (braces, conditions, returns) are not shown. Confirm both
 * against the full source.
 */
283 bool query_cmd(JCR *jcr)
285 POOL_MEM dev_name, VolumeName, MediaType, ChangerName;
286 BSOCK *dir = jcr->dir_bsock;
288 AUTOCHANGER *changer;
291 Dmsg1(100, "Query_cmd: %s", dir->msg);
/* ok is true only when exactly one field was scanned. */
292 ok = sscanf(dir->msg, query_device, dev_name.c_str()) == 1;
293 Dmsg1(100, "<dird: %s\n", dir->msg);
295 unbash_spaces(dev_name);
/* First pass: single devices. */
296 foreach_res(device, R_DEVICE) {
297 /* Find resource, and make sure we were able to open it */
298 if (strcmp(dev_name.c_str(), device->hdr.name) == 0) {
/* init_dev() is called for the matching resource; the condition that
 * guards this call is not visible here. */
300 device->dev = init_dev(jcr, device);
305 ok = dir_update_device(jcr, device->dev);
307 ok = dir->fsend(OK_query);
309 dir->fsend(NO_query);
/* Second pass: autochangers. */
314 foreach_res(changer, R_AUTOCHANGER) {
315 /* Find resource, and make sure we were able to open it */
316 if (strcmp(dev_name.c_str(), changer->hdr.name) == 0) {
/* An autochanger with no device list, or an empty one, is skipped. */
317 if (!changer->device || changer->device->size() == 0) {
318 continue; /* no devices */
320 ok = dir_update_changer(jcr, changer);
322 ok = dir->fsend(OK_query);
324 dir->fsend(NO_query);
329 /* If we get here, the device/autochanger was not found */
330 unbash_spaces(dir->msg);
331 pm_strcpy(jcr->errmsg, dir->msg);
332 dir->fsend(NO_device, dev_name.c_str());
333 Dmsg1(100, ">dird: %s\n", dir->msg);
/* Reply that echoes the received message with BAD_query; presumably the
 * branch taken when the scan above fails -- the guarding line is not
 * visible, confirm. */
335 unbash_spaces(dir->msg);
336 pm_strcpy(jcr->errmsg, dir->msg);
337 dir->fsend(BAD_query, jcr->errmsg);
338 Dmsg1(100, ">dird: %s\n", dir->msg);
348 * Destroy the Job Control Record and associated
349 * resources (sockets).
/*
 * stored_free_jcr: release the Storage daemon resources held by a JCR.
 *
 * Visible steps, in order: signal BNET_EOD and BNET_TERMINATE on the
 * Director socket, close the File daemon socket, free the name and MD5
 * buffers, free the restore volume list and remove the bootstrap file,
 * destroy the job_start_wait condition, free the read DCR, delete the read
 * and write store lists together with their device objects, and finally
 * call write_state_file().
 *
 * NOTE(review): lines are missing from this listing (braces, some guards,
 * and the end of the function), so this summary may be incomplete.
 */
351 void stored_free_jcr(JCR *jcr)
353 Dmsg2(800, "End Job JobId=%u %p\n", jcr->JobId, jcr);
354 if (jcr->dir_bsock) {
355 Dmsg2(800, "Send terminate jid=%d %p\n", jcr->JobId, jcr);
356 jcr->dir_bsock->signal(BNET_EOD);
357 jcr->dir_bsock->signal(BNET_TERMINATE);
359 if (jcr->file_bsock) {
360 jcr->file_bsock->close();
361 jcr->file_bsock = NULL;
/* NOTE(review): no guard on jcr->job_name is visible before this call,
 * unlike client_name, fileset_name and fileset_md5 below -- confirm that a
 * NULL check exists on the line that is not shown. */
364 free_pool_memory(jcr->job_name);
366 if (jcr->client_name) {
367 free_memory(jcr->client_name);
368 jcr->client_name = NULL;
/* NOTE(review): fileset_name and fileset_md5 are not shown being reset to
 * NULL after they are freed, unlike client_name -- confirm. */
370 if (jcr->fileset_name) {
371 free_memory(jcr->fileset_name);
373 if (jcr->fileset_md5) {
374 free_memory(jcr->fileset_md5);
380 /* Free any restore volume list created */
381 free_restore_volume_list(jcr);
/* The bootstrap file is removed from disk before its name buffer is
 * released. */
382 if (jcr->RestoreBootstrap) {
383 unlink(jcr->RestoreBootstrap);
384 free_pool_memory(jcr->RestoreBootstrap);
385 jcr->RestoreBootstrap = NULL;
/* Being linked to a device at this point is reported as a fatal error. */
387 if (jcr->next_dev || jcr->prev_dev) {
388 Emsg0(M_FATAL, 0, _("In free_jcr(), but still attached to device!!!!\n"));
390 pthread_cond_destroy(&jcr->job_start_wait);
396 /* Avoid a double free */
/* When dcr and read_dcr are the same object, the read pointer is cleared
 * here. */
397 if (jcr->dcr == jcr->read_dcr) {
398 jcr->read_dcr = NULL;
405 free_dcr(jcr->read_dcr);
406 jcr->read_dcr = NULL;
/* Delete the device object of every entry, then the list itself. */
409 if (jcr->read_store) {
411 foreach_alist(store, jcr->read_store) {
412 delete store->device;
415 delete jcr->read_store;
416 jcr->read_store = NULL;
/* Same teardown for the write store list. */
418 if (jcr->write_store) {
420 foreach_alist(store, jcr->write_store) {
421 delete store->device;
424 delete jcr->write_store;
425 jcr->write_store = NULL;
/* Presumably saves the daemon state under the working directory, keyed by
 * the first configured SD port -- verify against write_state_file(). The
 * condition guarding this call, if any, is not visible. */
430 write_state_file(me->working_directory, "bacula-sd", get_first_port_host_order(me->sdaddrs));