kes Apply UTF-8/16 patch from Yves Orton <demerphq@gmail.com> to clean up lex.c and make it more readable.

author Kern Sibbald <kern@sibbald.com>

Mon, 21 May 2007 09:18:42 +0000 (09:18 +0000)

committer Kern Sibbald <kern@sibbald.com>

Mon, 21 May 2007 09:18:42 +0000 (09:18 +0000)
author Kern Sibbald <kern@sibbald.com>
Mon, 21 May 2007 09:18:42 +0000 (09:18 +0000)
committer Kern Sibbald <kern@sibbald.com>
Mon, 21 May 2007 09:18:42 +0000 (09:18 +0000)
diff --git a/bacula/README b/bacula/README

index 67c06b806b0a138ca9c5618bcbadf534b209b297..8ac5920f1effa5467f9af80eaac218c095a621b4 100644 (file)
--- a/bacula/README
+++ b/bacula/README
@@ -11,17 +11,18 @@ following:
  
  To Configure it:
  
-  ./configure \
-    --sbindir=$HOME/bacula/bin \
-    --sysconfdir=$HOME/bacula/bin \
-    --with-pid-dir=$HOME/bacula/bin \
-    --with-subsys-dir=$HOME/bacula/bin \
-    --enable-gnome \
-    --with-mysql \
-    --with-working-dir=$HOME/bacula/bin/working \
-    --with-dump-email=YOUR_EMAIL_ADDRESS \
-    --with-job-email=YOUR_EMAIL_ADDRESS \
-    --with-smtp-host=YOUR_SMTP_SERVER_ADDRESS
+        CFLAGS="-g -O2" \
+          ./configure \
+            --sbindir=$HOME/bacula/bin \
+            --sysconfdir=$HOME/bacula/bin \
+            --with-pid-dir=$HOME/bacula/bin/working \
+            --with-subsys-dir=$HOME/bacula/bin/working \
+            --enable-smartalloc \
+            --with-mysql \
+            --with-working-dir=$HOME/bacula/bin/working \
+            --with-dump-email=your@address.com \
+            --with-job-email=your@address.com \
+            --with-smtp-host=localhost
  
  
  Build Bacula:
diff --git a/bacula/src/lib/lex.c b/bacula/src/lib/lex.c

index 049f75e60a0086318314a47bf6eeedc9d89eced0..94ff7dc6542a930608f19d771efff71bdc91c277 100644 (file)
--- a/bacula/src/lib/lex.c
+++ b/bacula/src/lib/lex.c
@@ -291,6 +291,8 @@ static const char *lex_state_to_str(int state)
     case lex_identifier:    return _("identifier");
     case lex_string:        return _("string");
     case lex_quoted_string: return _("quoted_string");
+   case lex_utf8_bom:      return _("UTF-8 Byte Order Mark");
+   case lex_utf16_le_bom:  return _("UTF-16le Byte Order Mark");
     default:                return "??????";
     }
  }
@@ -318,6 +320,8 @@ const char *lex_tok_to_str(int token)
     case T_EOF:             return "T_EOF";
     case T_COMMA:           return "T_COMMA";
     case T_EOL:             return "T_EOL";
+   case T_UTF8_BOM:        return "T_UTF8_BOM";
+   case T_UTF16_BOM:       return "T_UTF16_BOM";
     default:                return "??????";
     }
  }
@@ -350,7 +354,13 @@ lex_get_token(LEX *lf, int expect)
     int ch;
     int token = T_NONE;
     bool esc_next = false;
-   int unicode_count = 0;
+   /* Unicode files, especially on Win32, may begin with a "Byte Order Mark"
+      to indicate which transmission format the file is in. The codepoint for
+      this mark is U+FEFF and is represented as the octets EF-BB-BF in UTF-8
+      and as FF-FE in UTF-16le (little endian) and FE-FF in UTF-16be (big endian).
+      We use a distinct state for UTF-8 and UTF-16le, and use bom_bytes_seen
+      to tell which byte we are expecting. */
+   int bom_bytes_seen = 0;
  
     Dmsg0(dbglvl, "enter lex_get_token\n");
     while (token == T_NONE) {
@@ -422,15 +432,25 @@ lex_get_token(LEX *lf, int expect)
              lf->state = lex_include;
              begin_str(lf, 0);
              break;
-         case 0xEF:
+         case 0xEF: /* probably a UTF-8 BOM */
+         case 0xFF: /* probably a UTF-16le BOM */
+         case 0xFE: /* probably a UTF-16be BOM (error)*/
              if (lf->line_no != 1 || lf->col_no != 1)
              {
                 lf->state = lex_string;
                 begin_str(lf, ch);
-               break;
+            } else {
+               bom_bytes_seen = 1;
+               if (ch == 0xEF) {
+                  lf->state = lex_utf8_bom;
+               } else if (ch == 0xFF) {
+                  lf->state = lex_utf16_le_bom;
+               } else {
+                  scan_err0(lf, _("This config file appears to be in an "
+                     "unsupported Unicode format (UTF-16be). Please resave as UTF-8\n"));
+                  return T_ERROR;
+               }
              }
-            lf->state = lex_unicode_mark;
-            unicode_count = 1;
              break;
           default:
              lf->state = lex_string;
@@ -550,7 +570,7 @@ lex_get_token(LEX *lf, int expect)
               ch == ';' || ch == ','   || ch == '"' || ch == '#') {
              /* Keep the original LEX so we can print an error if the included file can't be opened. */
              LEX* lfori = lf;
-            
+
              lf->state = lex_none;
              lf = lex_open_file(lf, lf->str, lf->scan_error);
              if (lf == NULL) {
@@ -563,26 +583,29 @@ lex_get_token(LEX *lf, int expect)
           }
           add_str(lf, ch);
           break;
-      case lex_unicode_mark:
-         if (ch == L_EOF) {
-            token = T_ERROR;
-            break;
-         }
-         unicode_count++;
-         if (unicode_count == 2) {
-            if (ch != 0xBB) {
-               token = T_ERROR;
-               break;
-            }
-         } else if (unicode_count == 3) {
-            if (ch != 0xBF) {
-               token = T_ERROR;
-               break;
-            }
-            token = T_UNICODE_MARK;
+      case lex_utf8_bom:
+         /* we only end up in this state if we have read an 0xEF 
+            as the first byte of the file, indicating we are probably
+            reading a UTF-8 file */
+         if (ch == 0xBB && bom_bytes_seen == 1) {
+            bom_bytes_seen++;
+         } else if (ch == 0xBF && bom_bytes_seen == 2) {
+            token = T_UTF8_BOM;
              lf->state = lex_none;
-            break;
-         }
+         } else {
+            token = T_ERROR;
+        }
+         break;
+      case lex_utf16_le_bom:
+         /* we only end up in this state if we have read an 0xFF 
+            as the first byte of the file -- indicating that we are
+            probably dealing with an Intel based (little endian) UTF-16 file*/
+        if (ch == 0xFE) {
+           token = T_UTF16_BOM;
+           lf->state = lex_none;
+        } else {
+           token = T_ERROR;
+        }
           break;
        }
        Dmsg4(dbglvl, "ch=%d state=%s token=%s %c\n", ch, lex_state_to_str(lf->state),
diff --git a/bacula/src/lib/lex.h b/bacula/src/lib/lex.h

index 6a0caea39eed0801b939b3cdd4ba950481fc3311..a5f9c394134774115f0827e7c36c617691eacfc0 100644 (file)
--- a/bacula/src/lib/lex.h
+++ b/bacula/src/lib/lex.h
@@ -59,7 +59,8 @@
  #define T_COMMA                       111
  #define T_EOL                         112
  #define T_ERROR                       200
-#define T_UNICODE_MARK                201
+#define T_UTF8_BOM                    201 /* File starts with a UTF-8 BOM*/
+#define T_UTF16_BOM                   202 /* File starts with a UTF-16LE BOM*/
  
  /*
   * The following will be returned only if
@@ -85,7 +86,8 @@ enum lex_state {
     lex_string,
     lex_quoted_string,
     lex_include,
-   lex_unicode_mark
+   lex_utf8_bom,      /* we are parsing out a utf8 byte order mark */ 
+   lex_utf16_le_bom   /* we are parsing out a utf-16 (little endian) byte order mark */
  };
  
  /* Lex scan options */
diff --git a/bacula/src/lib/parse_conf.c b/bacula/src/lib/parse_conf.c

index 96f6d8adc9c1a4c4c3db730437c7d6a7ba2d0a87..4b1e84004205f7748254e40eb9c9c60948560395 100644 (file)
--- a/bacula/src/lib/parse_conf.c
+++ b/bacula/src/lib/parse_conf.c
@@ -828,15 +828,17 @@ parse_config(const char *cf, LEX_ERROR_HANDLER *scan_error, int err_type)
           case p_none:
              if (token == T_EOL) {
                 break;
-            }
-            if (token == T_UNICODE_MARK) {
+            } else if (token == T_UTF8_BOM) {
+               /* We can assume the file is UTF-8 as we have seen a UTF-8 BOM */
                 break;
-            }
-            if (token != T_IDENTIFIER) {
+            } else if (token == T_UTF16_BOM) {
+               scan_err0(lc, _("Currently we cannot handle UTF-16 source files. Please convert to UTF-8\n"));
+               return 0;
+            } else if (token != T_IDENTIFIER) {
                 scan_err1(lc, _("Expected a Resource name identifier, got: %s"), lc->str);
                 return 0;
              }
-            for (i=0; resources[i].name; i++)
+            for (i=0; resources[i].name; i++) {
                 if (strcasecmp(resources[i].name, lc->str) == 0) {
                    state = p_resource;
                    items = resources[i].items;
@@ -844,6 +846,7 @@ parse_config(const char *cf, LEX_ERROR_HANDLER *scan_error, int err_type)
                    init_resource(res_type, items, pass);
                    break;
                 }
+            }
              if (state == p_none) {
                 scan_err1(lc, _("expected resource name, got: %s"), lc->str);
                 return 0;
diff --git a/bacula/technotes-2.1 b/bacula/technotes-2.1

index 1f6868101b984bc485b15d101e960d5e7eb589b2..a352b9161aadfce8cf5a1fc29095854e6037ab80 100644 (file)
--- a/bacula/technotes-2.1
+++ b/bacula/technotes-2.1
@@ -1,6 +1,9 @@
                Technical notes on version 2.1
  
  General:
+21May07
+kes  Apply UTF-8/16 patch from Yves Orton <demerphq@gmail.com> to
+     clean up lex.c and make it more readable.
  20May07
  kes  Move more bnet functions into the BSOCK class.
  kes  Fix tray-monitor by not requiring a timer interval in bnet_connect()
author	Kern Sibbald <kern@sibbald.com>
	Mon, 21 May 2007 09:18:42 +0000 (09:18 +0000)
committer	Kern Sibbald <kern@sibbald.com>
	Mon, 21 May 2007 09:18:42 +0000 (09:18 +0000)
bacula/README		patch \| blob \| history
bacula/src/lib/lex.c		patch \| blob \| history
bacula/src/lib/lex.h		patch \| blob \| history
bacula/src/lib/parse_conf.c		patch \| blob \| history
bacula/technotes-2.1		patch \| blob \| history