From: Kern Sibbald <kern@sibbald.com>
Date: Mon, 21 May 2007 09:18:42 +0000 (+0000)
Subject: kes  Apply UTF-8/16 patch from Yves Orton <demerphq@gmail.com> to
X-Git-Tag: Release-7.0.0~6282
X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=145eb7ebe4a1c8acfc882d1b88251fc4c7fa2d1f;p=bacula%2Fbacula

kes  Apply UTF-8/16 patch from Yves Orton <demerphq@gmail.com> to
     clean up lex.c and make it more readable.

git-svn-id: https://bacula.svn.sourceforge.net/svnroot/bacula/trunk@4863 91ce42f0-d328-0410-95d8-f526ca767f89
---

diff --git a/bacula/README b/bacula/README
index 67c06b806b..8ac5920f1e 100644
--- a/bacula/README
+++ b/bacula/README
@@ -11,17 +11,18 @@ following:
 
 To Configure it:
 
-  ./configure \
-    --sbindir=$HOME/bacula/bin \
-    --sysconfdir=$HOME/bacula/bin \
-    --with-pid-dir=$HOME/bacula/bin \
-    --with-subsys-dir=$HOME/bacula/bin \
-    --enable-gnome \
-    --with-mysql \
-    --with-working-dir=$HOME/bacula/bin/working \
-    --with-dump-email=YOUR_EMAIL_ADDRESS \
-    --with-job-email=YOUR_EMAIL_ADDRESS \
-    --with-smtp-host=YOUR_SMTP_SERVER_ADDRESS
+        CFLAGS="-g -O2" \
+          ./configure \
+            --sbindir=$HOME/bacula/bin \
+            --sysconfdir=$HOME/bacula/bin \
+            --with-pid-dir=$HOME/bacula/bin/working \
+            --with-subsys-dir=$HOME/bacula/bin/working \
+            --enable-smartalloc \
+            --with-mysql \
+            --with-working-dir=$HOME/bacula/bin/working \
+            --with-dump-email=your@address.com \
+            --with-job-email=your@address.com \
+            --with-smtp-host=localhost
 
 
 Build Bacula:
diff --git a/bacula/src/lib/lex.c b/bacula/src/lib/lex.c
index 049f75e60a..94ff7dc654 100644
--- a/bacula/src/lib/lex.c
+++ b/bacula/src/lib/lex.c
@@ -291,6 +291,8 @@ static const char *lex_state_to_str(int state)
    case lex_identifier:    return _("identifier");
    case lex_string:        return _("string");
    case lex_quoted_string: return _("quoted_string");
+   case lex_utf8_bom:      return _("UTF-8 Byte Order Mark");
+   case lex_utf16_le_bom:  return _("UTF-16le Byte Order Mark");
    default:                return "??????";
    }
 }
@@ -318,6 +320,8 @@ const char *lex_tok_to_str(int token)
    case T_EOF:             return "T_EOF";
    case T_COMMA:           return "T_COMMA";
    case T_EOL:             return "T_EOL";
+   case T_UTF8_BOM:        return "T_UTF8_BOM";
+   case T_UTF16_BOM:       return "T_UTF16_BOM";
    default:                return "??????";
    }
 }
@@ -350,7 +354,13 @@ lex_get_token(LEX *lf, int expect)
    int ch;
    int token = T_NONE;
    bool esc_next = false;
-   int unicode_count = 0;
+   /* Unicode files, especially on Win32, may begin with a "Byte Order Mark"
+      to indicate which transmission format the file is in. The codepoint for
+      this mark is U+FEFF and is represented as the octets EF-BB-BF in UTF-8
+      and as FF-FE in UTF-16le (little endian) and FE-FF in UTF-16be (big endian).
+      We use a distinct state for UTF-8 and UTF-16le, and use bom_bytes_seen
+      to tell which byte we are expecting. */
+   int bom_bytes_seen = 0;
 
    Dmsg0(dbglvl, "enter lex_get_token\n");
    while (token == T_NONE) {
@@ -422,15 +432,25 @@ lex_get_token(LEX *lf, int expect)
             lf->state = lex_include;
             begin_str(lf, 0);
             break;
-         case 0xEF:
+         case 0xEF: /* probably a UTF-8 BOM */
+         case 0xFF: /* probably a UTF-16le BOM */
+         case 0xFE: /* probably a UTF-16be BOM (error)*/
             if (lf->line_no != 1 || lf->col_no != 1)
             {
                lf->state = lex_string;
                begin_str(lf, ch);
-               break;
+            } else {
+               bom_bytes_seen = 1;
+               if (ch == 0xEF) {
+                  lf->state = lex_utf8_bom;
+               } else if (ch == 0xFF) {
+                  lf->state = lex_utf16_le_bom;
+               } else {
+                  scan_err0(lf, _("This config file appears to be in an "
+                     "unsupported Unicode format (UTF-16be). Please resave as UTF-8\n"));
+                  return T_ERROR;
+               }
             }
-            lf->state = lex_unicode_mark;
-            unicode_count = 1;
             break;
          default:
             lf->state = lex_string;
@@ -550,7 +570,7 @@ lex_get_token(LEX *lf, int expect)
              ch == ';' || ch == ','   || ch == '"' || ch == '#') {
             /* Keep the original LEX so we can print an error if the included file can't be opened. */
             LEX* lfori = lf;
-            
+
             lf->state = lex_none;
             lf = lex_open_file(lf, lf->str, lf->scan_error);
             if (lf == NULL) {
@@ -563,26 +583,29 @@ lex_get_token(LEX *lf, int expect)
          }
          add_str(lf, ch);
          break;
-      case lex_unicode_mark:
-         if (ch == L_EOF) {
-            token = T_ERROR;
-            break;
-         }
-         unicode_count++;
-         if (unicode_count == 2) {
-            if (ch != 0xBB) {
-               token = T_ERROR;
-               break;
-            }
-         } else if (unicode_count == 3) {
-            if (ch != 0xBF) {
-               token = T_ERROR;
-               break;
-            }
-            token = T_UNICODE_MARK;
+      case lex_utf8_bom:
+         /* we only end up in this state if we have read an 0xEF 
+            as the first byte of the file, indicating we are probably
+            reading a UTF-8 file */
+         if (ch == 0xBB && bom_bytes_seen == 1) {
+            bom_bytes_seen++;
+         } else if (ch == 0xBF && bom_bytes_seen == 2) {
+            token = T_UTF8_BOM;
             lf->state = lex_none;
-            break;
-         }
+         } else {
+            token = T_ERROR;
+	 }
+         break;
+      case lex_utf16_le_bom:
+         /* we only end up in this state if we have read an 0xFF 
+            as the first byte of the file -- indicating that we are
+            probably dealing with an Intel based (little endian) UTF-16 file*/
+	 if (ch == 0xFE) {
+	    token = T_UTF16_BOM;
+	    lf->state = lex_none;
+	 } else {
+	    token = T_ERROR;
+	 }
          break;
       }
       Dmsg4(dbglvl, "ch=%d state=%s token=%s %c\n", ch, lex_state_to_str(lf->state),
diff --git a/bacula/src/lib/lex.h b/bacula/src/lib/lex.h
index 6a0caea39e..a5f9c39413 100644
--- a/bacula/src/lib/lex.h
+++ b/bacula/src/lib/lex.h
@@ -59,7 +59,8 @@
 #define T_COMMA                       111
 #define T_EOL                         112
 #define T_ERROR                       200
-#define T_UNICODE_MARK                201
+#define T_UTF8_BOM                    201 /* File starts with a UTF-8 BOM*/
+#define T_UTF16_BOM                   202 /* File starts with a UTF-16LE BOM*/
 
 /*
  * The following will be returned only if
@@ -85,7 +86,8 @@ enum lex_state {
    lex_string,
    lex_quoted_string,
    lex_include,
-   lex_unicode_mark
+   lex_utf8_bom,      /* we are parsing out a utf8 byte order mark */ 
+   lex_utf16_le_bom   /* we are parsing out a utf-16 (little endian) byte order mark */
 };
 
 /* Lex scan options */
diff --git a/bacula/src/lib/parse_conf.c b/bacula/src/lib/parse_conf.c
index 96f6d8adc9..4b1e840042 100644
--- a/bacula/src/lib/parse_conf.c
+++ b/bacula/src/lib/parse_conf.c
@@ -828,15 +828,17 @@ parse_config(const char *cf, LEX_ERROR_HANDLER *scan_error, int err_type)
          case p_none:
             if (token == T_EOL) {
                break;
-            }
-            if (token == T_UNICODE_MARK) {
+            } else if (token == T_UTF8_BOM) {
+               /* We can assume the file is UTF-8 as we have seen a UTF-8 BOM */
                break;
-            }
-            if (token != T_IDENTIFIER) {
+            } else if (token == T_UTF16_BOM) {
+               scan_err0(lc, _("Currently we cannot handle UTF-16 source files. Please convert to UTF-8\n"));
+               return 0;
+            } else if (token != T_IDENTIFIER) {
                scan_err1(lc, _("Expected a Resource name identifier, got: %s"), lc->str);
                return 0;
             }
-            for (i=0; resources[i].name; i++)
+            for (i=0; resources[i].name; i++) {
                if (strcasecmp(resources[i].name, lc->str) == 0) {
                   state = p_resource;
                   items = resources[i].items;
@@ -844,6 +846,7 @@ parse_config(const char *cf, LEX_ERROR_HANDLER *scan_error, int err_type)
                   init_resource(res_type, items, pass);
                   break;
                }
+            }
             if (state == p_none) {
                scan_err1(lc, _("expected resource name, got: %s"), lc->str);
                return 0;
diff --git a/bacula/technotes-2.1 b/bacula/technotes-2.1
index 1f6868101b..a352b9161a 100644
--- a/bacula/technotes-2.1
+++ b/bacula/technotes-2.1
@@ -1,6 +1,9 @@
               Technical notes on version 2.1
 
 General:
+21May07
+kes  Apply UTF-8/16 patch from Yves Orton <demerphq@gmail.com> to
+     clean up lex.c and make it more readable.
 20May07
 kes  Move more bnet functions into the BSOCK class.
 kes  Fix tray-monitor by not requiring a timer interval in bnet_connect()