From: Kern Sibbald Date: Mon, 21 May 2007 09:18:42 +0000 (+0000) Subject: kes Apply UTF-8/16 patch from Yves Orton to X-Git-Tag: Release-7.0.0~6282 X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=145eb7ebe4a1c8acfc882d1b88251fc4c7fa2d1f;p=bacula%2Fbacula kes Apply UTF-8/16 patch from Yves Orton to clean up lex.c and make it more readable. git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/trunk@4863 91ce42f0-d328-0410-95d8-f526ca767f89 --- diff --git a/bacula/README b/bacula/README index 67c06b806b..8ac5920f1e 100644 --- a/bacula/README +++ b/bacula/README @@ -11,17 +11,18 @@ following: To Configure it: - ./configure \ - --sbindir=$HOME/bacula/bin \ - --sysconfdir=$HOME/bacula/bin \ - --with-pid-dir=$HOME/bacula/bin \ - --with-subsys-dir=$HOME/bacula/bin \ - --enable-gnome \ - --with-mysql \ - --with-working-dir=$HOME/bacula/bin/working \ - --with-dump-email=YOUR_EMAIL_ADDRESS \ - --with-job-email=YOUR_EMAIL_ADDRESS \ - --with-smtp-host=YOUR_SMTP_SERVER_ADDRESS + CFLAGS="-g -O2" \ + ./configure \ + --sbindir=$HOME/bacula/bin \ + --sysconfdir=$HOME/bacula/bin \ + --with-pid-dir=$HOME/bacula/bin/working \ + --with-subsys-dir=$HOME/bacula/bin/working \ + --enable-smartalloc \ + --with-mysql \ + --with-working-dir=$HOME/bacula/bin/working \ + --with-dump-email=your@address.com \ + --with-job-email=your@address.com \ + --with-smtp-host=localhost Build Bacula: diff --git a/bacula/src/lib/lex.c b/bacula/src/lib/lex.c index 049f75e60a..94ff7dc654 100644 --- a/bacula/src/lib/lex.c +++ b/bacula/src/lib/lex.c @@ -291,6 +291,8 @@ static const char *lex_state_to_str(int state) case lex_identifier: return _("identifier"); case lex_string: return _("string"); case lex_quoted_string: return _("quoted_string"); + case lex_utf8_bom: return _("UTF-8 Byte Order Mark"); + case lex_utf16_le_bom: return _("UTF-16le Byte Order Mark"); default: return "??????"; } } @@ -318,6 +320,8 @@ const char *lex_tok_to_str(int token) case T_EOF: return "T_EOF"; case T_COMMA: return 
"T_COMMA"; case T_EOL: return "T_EOL"; + case T_UTF8_BOM: return "T_UTF8_BOM"; + case T_UTF16_BOM: return "T_UTF16_BOM"; default: return "??????"; } } @@ -350,7 +354,13 @@ lex_get_token(LEX *lf, int expect) int ch; int token = T_NONE; bool esc_next = false; - int unicode_count = 0; + /* Unicode files, especially on Win32, may begin with a "Byte Order Mark" + to indicate which transmission format the file is in. The codepoint for + this mark is U+FEFF and is represented as the octets EF-BB-BF in UTF-8 + and as FF-FE in UTF-16le(little endian) and FE-FF in UTF-16(big endian). + We use a distinct state for UTF-8 and UTF-16le, and use bom_bytes_seen + to tell which byte we are expecting. */ + int bom_bytes_seen = 0; Dmsg0(dbglvl, "enter lex_get_token\n"); while (token == T_NONE) { @@ -422,15 +432,25 @@ lex_get_token(LEX *lf, int expect) lf->state = lex_include; begin_str(lf, 0); break; - case 0xEF: + case 0xEF: /* probably a UTF-8 BOM */ + case 0xFF: /* probably a UTF-16le BOM */ + case 0xFE: /* probably a UTF-16be BOM (error)*/ if (lf->line_no != 1 || lf->col_no != 1) { lf->state = lex_string; begin_str(lf, ch); - break; + } else { + bom_bytes_seen = 1; + if (ch == 0xEF) { + lf->state = lex_utf8_bom; + } else if (ch == 0xFF) { + lf->state = lex_utf16_le_bom; + } else { + scan_err0(lf, _("This config file appears to be in an " + "unsupported Unicode format (UTF-16be). Please resave as UTF-8\n")); + return T_ERROR; + } } - lf->state = lex_unicode_mark; - unicode_count = 1; break; default: lf->state = lex_string; @@ -550,7 +570,7 @@ lex_get_token(LEX *lf, int expect) ch == ';' || ch == ',' || ch == '"' || ch == '#') { /* Keep the original LEX so we can print an error if the included file can't be opened. 
*/ LEX* lfori = lf; - + lf->state = lex_none; lf = lex_open_file(lf, lf->str, lf->scan_error); if (lf == NULL) { @@ -563,26 +583,29 @@ lex_get_token(LEX *lf, int expect) } add_str(lf, ch); break; - case lex_unicode_mark: - if (ch == L_EOF) { - token = T_ERROR; - break; - } - unicode_count++; - if (unicode_count == 2) { - if (ch != 0xBB) { - token = T_ERROR; - break; - } - } else if (unicode_count == 3) { - if (ch != 0xBF) { - token = T_ERROR; - break; - } - token = T_UNICODE_MARK; + case lex_utf8_bom: + /* we only end up in this state if we have read an 0xEF + as the first byte of the file, indicating we are probably + reading a UTF-8 file */ + if (ch == 0xBB && bom_bytes_seen == 1) { + bom_bytes_seen++; + } else if (ch == 0xBF && bom_bytes_seen == 2) { + token = T_UTF8_BOM; lf->state = lex_none; - break; - } + } else { + token = T_ERROR; + } + break; + case lex_utf16_le_bom: + /* we only end up in this state if we have read an 0xFF + as the first byte of the file -- indicating that we are + probably dealing with an Intel based (little endian) UTF-16 file*/ + if (ch == 0xFE) { + token = T_UTF16_BOM; + lf->state = lex_none; + } else { + token = T_ERROR; + } break; } Dmsg4(dbglvl, "ch=%d state=%s token=%s %c\n", ch, lex_state_to_str(lf->state), diff --git a/bacula/src/lib/lex.h b/bacula/src/lib/lex.h index 6a0caea39e..a5f9c39413 100644 --- a/bacula/src/lib/lex.h +++ b/bacula/src/lib/lex.h @@ -59,7 +59,8 @@ #define T_COMMA 111 #define T_EOL 112 #define T_ERROR 200 -#define T_UNICODE_MARK 201 +#define T_UTF8_BOM 201 /* File starts with a UTF-8 BOM*/ +#define T_UTF16_BOM 202 /* File starts with a UTF-16LE BOM*/ /* * The following will be returned only if @@ -85,7 +86,8 @@ enum lex_state { lex_string, lex_quoted_string, lex_include, - lex_unicode_mark + lex_utf8_bom, /* we are parsing out a utf8 byte order mark */ + lex_utf16_le_bom /* we are parsing out a utf-16 (little endian) byte order mark */ }; /* Lex scan options */ diff --git a/bacula/src/lib/parse_conf.c 
b/bacula/src/lib/parse_conf.c index 96f6d8adc9..4b1e840042 100644 --- a/bacula/src/lib/parse_conf.c +++ b/bacula/src/lib/parse_conf.c @@ -828,15 +828,17 @@ parse_config(const char *cf, LEX_ERROR_HANDLER *scan_error, int err_type) case p_none: if (token == T_EOL) { break; - } - if (token == T_UNICODE_MARK) { + } else if (token == T_UTF8_BOM) { + /* We can assume the file is UTF-8 as we have seen a UTF-8 BOM */ break; - } - if (token != T_IDENTIFIER) { + } else if (token == T_UTF16_BOM) { + scan_err0(lc, _("Currently we cannot handle UTF-16 source files. Please convert to UTF-8\n")); + return 0; + } else if (token != T_IDENTIFIER) { scan_err1(lc, _("Expected a Resource name identifier, got: %s"), lc->str); return 0; } - for (i=0; resources[i].name; i++) + for (i=0; resources[i].name; i++) { if (strcasecmp(resources[i].name, lc->str) == 0) { state = p_resource; items = resources[i].items; @@ -844,6 +846,7 @@ parse_config(const char *cf, LEX_ERROR_HANDLER *scan_error, int err_type) init_resource(res_type, items, pass); break; } + } if (state == p_none) { scan_err1(lc, _("expected resource name, got: %s"), lc->str); return 0; diff --git a/bacula/technotes-2.1 b/bacula/technotes-2.1 index 1f6868101b..a352b9161a 100644 --- a/bacula/technotes-2.1 +++ b/bacula/technotes-2.1 @@ -1,6 +1,9 @@ Technical notes on version 2.1 General: +21May07 +kes Apply UTF-8/16 patch from Yves Orton to + clean up lex.c and make it more readable. 20May07 kes Move more bnet functions into the BSOCK class. kes Fix tray-monitor by not requiring a timer interval in bnet_connect()