pcre: enable UCP, UTF-8 (if available), extend t/19-match

author Michael Stapelberg <michael@stapelberg.de>

Sun, 11 Sep 2011 10:40:51 +0000 (11:40 +0100)

committer Michael Stapelberg <michael@stapelberg.de>

Sun, 11 Sep 2011 10:40:51 +0000 (11:40 +0100)
author Michael Stapelberg <michael@stapelberg.de>
Sun, 11 Sep 2011 10:40:51 +0000 (11:40 +0100)
committer Michael Stapelberg <michael@stapelberg.de>
Sun, 11 Sep 2011 10:40:51 +0000 (11:40 +0100)
diff --git a/src/regex.c b/src/regex.c

index df01dae4ed8f72a639cbf821122b6d867df25f81..da8f91d8a67fbd4f09e35fc0a512598d0c0ccd11 100644 (file)
--- a/src/regex.c
+++ b/src/regex.c
@@ -21,16 +21,31 @@
   */
  struct regex *regex_new(const char *pattern) {
      const char *error;
-    int offset;
+    int errorcode, offset;
  
      struct regex *re = scalloc(sizeof(struct regex));
      re->pattern = sstrdup(pattern);
-    if (!(re->regex = pcre_compile(pattern, 0, &error, &offset, NULL))) {
-        ELOG("PCRE regular expression compilation failed at %d: %s",
+    /* We use PCRE_UCP so that \B, \b, \D, \d, \S, \s, \W, \w and some POSIX
+     * character classes play nicely with Unicode */
+    int options = PCRE_UCP | PCRE_UTF8;
+    while (!(re->regex = pcre_compile2(pattern, options, &errorcode, &error, &offset, NULL))) {
+        /* Error code 32 means that PCRE was not compiled with UTF-8 support
+         * (see pcreapi(3)). In that case, we drop PCRE_UTF8 and try again */
+        if (errorcode == 32) {
+            options &= ~PCRE_UTF8;
+            continue;
+        }
+        ELOG("PCRE regular expression compilation failed at %d: %s\n",
               offset, error);
          return NULL;
      }
      re->extra = pcre_study(re->regex, 0, &error);
+    /* If an error happened, we print the error message, but continue.
+     * Studying the regular expression leads to faster matching, but it’s not
+     * absolutely necessary. */
+    if (error) {
+        ELOG("PCRE regular expression studying failed: %s\n", error);
+    }
      return re;
  }
  
@@ -43,8 +58,8 @@ struct regex *regex_new(const char *pattern) {
  bool regex_matches(struct regex *regex, const char *input) {
      int rc;
  
-    /* TODO: is strlen(input) correct for UTF-8 matching? */
-    /* TODO: enable UTF-8 */
+    /* We use strlen() because pcre_exec() expects the length of the input
+     * string in bytes */
      if ((rc = pcre_exec(regex->regex, regex->extra, input, strlen(input), 0, 0, NULL, 0)) == 0) {
          LOG("Regular expression \"%s\" matches \"%s\"\n",
              regex->pattern, input);
@@ -57,7 +72,7 @@ bool regex_matches(struct regex *regex, const char *input) {
          return false;
      }
  
-    /* TODO: handle the other error codes */
-    LOG("PCRE error\n");
+    ELOG("PCRE error %d while trying to use regular expression \"%s\" on input \"%s\", see pcreapi(3)\n",
+         rc, regex->pattern, input);
      return false;
  }
diff --git a/testcases/t/19-match.t b/testcases/t/19-match.t

index 4d38c41fb5b408d77fd2683e0b9d1a9164733262..e4fc6ec0225bbbd5116c6c8a7fac26676e2ed86e 100644 (file)
--- a/testcases/t/19-match.t
+++ b/testcases/t/19-match.t
@@ -139,7 +139,7 @@ sleep 0.25;
  
  # two windows should be here
  $content = get_ws_content($tmp);
-ok(@{$content} == 1, 'two windows opened');
+ok(@{$content} == 1, 'window opened');
  
  cmd '[class="^special[0-9]$"] kill';
  
@@ -148,5 +148,33 @@ sleep 0.25;
  $content = get_ws_content($tmp);
  is(@{$content}, 0, 'window killed');
  
+######################################################################
+# check that UTF-8 works when matching
+######################################################################
+
+$tmp = fresh_workspace;
+
+$left = $x->root->create_child(
+    class => WINDOW_CLASS_INPUT_OUTPUT,
+    rect => [ 0, 0, 30, 30 ],
+    background_color => '#0000ff',
+);
+
+$left->_create;
+set_wm_class($left->id, 'special7', 'special7');
+$left->name('ä 3');
+$left->map;
+sleep 0.25;
+
+# one window should be here
+$content = get_ws_content($tmp);
+ok(@{$content} == 1, 'window opened');
+
+cmd '[title="^\w [3]$"] kill';
+
+sleep 0.25;
+
+$content = get_ws_content($tmp);
+is(@{$content}, 0, 'window killed');
  
  done_testing;
author	Michael Stapelberg <michael@stapelberg.de>
	Sun, 11 Sep 2011 10:40:51 +0000 (11:40 +0100)
committer	Michael Stapelberg <michael@stapelberg.de>
	Sun, 11 Sep 2011 10:40:51 +0000 (11:40 +0100)
src/regex.c		patch \| blob \| history
testcases/t/19-match.t		patch \| blob \| history