git.sur5r.net Git - openldap/blob - libraries/liblunicode/utbm/utbm.c

   1 /*
   2  * Copyright 1997, 1998, 1999 Computing Research Labs,
   3  * New Mexico State University
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in
  13  * all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
  19  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
  20  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  21  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  */
  23 #ifndef lint
  24 static char rcsid[] = "$Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $";
  25 #endif
  26
  27 /*
  28  * Assumptions:
  29  * 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
  30  * 2. Case conversions are all one-to-one.
  31  * 3. Text and pattern have already been normalized in some fashion.
  32  */
  33
  34 #include <stdlib.h>
  35 #include <unistd.h>
  36 #include <string.h>
  37 #include "utbm.h"
  38
  39 /*
  40  * Single pattern character.
  41  */
  42 typedef struct {
  43     ucs4_t lc;
  44     ucs4_t uc;
  45     ucs4_t tc;
  46 } _utbm_char_t;
  47
  48 typedef struct {
  49     _utbm_char_t *ch;
  50     unsigned long skip;
  51 } _utbm_skip_t;
  52
  53 typedef struct _utbm_pattern_t {
  54     unsigned long flags;
  55
  56     _utbm_char_t *pat;
  57     unsigned long pat_used;
  58     unsigned long pat_size;
  59     unsigned long patlen;
  60
  61     _utbm_skip_t *skip;
  62     unsigned long skip_used;
  63     unsigned long skip_size;
  64
  65     unsigned long md4;
  66 } _utbm_pattern_t;
  67
  68 /*************************************************************************
  69  *
  70  * Support functions.
  71  *
  72  *************************************************************************/
  73
  74 /*
  75  * Routine to look up the skip value for a character.
  76  */
  77 static unsigned long
  78 _utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
  79 {
  80     unsigned long i;
  81     ucs4_t c1, c2;
  82     _utbm_skip_t *sp;
  83
  84     if (start >= end)
  85       return 0;
  86
  87     c1 = *start;
  88     c2 = (start + 1 < end) ? *(start + 1) : ~0;
  89     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
  90       c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
  91
  92     for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
  93         if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
  94             return ((unsigned long) (end - start) < sp->skip) ?
  95                 end - start : sp->skip;
  96         }
  97     }
  98     return p->patlen;
  99 }
 100
 101 static int
 102 _utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
 103             unsigned long *match_start, unsigned long *match_end)
 104 {
 105     int check_space;
 106     ucs4_t c1, c2;
 107     unsigned long count;
 108     _utbm_char_t *cp;
 109
 110     /*
 111      * Set the potential match endpoint first.
 112      */
 113     *match_end = (start - text) + 1;
 114
 115     c1 = *start;
 116     c2 = (start + 1 < end) ? *(start + 1) : ~0;
 117     if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
 118         c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
 119         /*
 120          * Adjust the match end point to occur after the UTF-16 character.
 121          */
 122         *match_end = *match_end + 1;
 123     }
 124
 125     if (pat->pat_used == 1) {
 126         *match_start = start - text;
 127         return 1;
 128     }
 129
 130     /*
 131      * Compare backward.
 132      */
 133     cp = pat->pat + (pat->pat_used - 1);
 134
 135     for (count = pat->patlen; start > text && count > 0;) {
 136         /*
 137          * Ignore non-spacing characters if indicated.
 138          */
 139         if (pat->flags & UTBM_IGNORE_NONSPACING) {
 140             while (start > text && _utbm_nonspacing(c1)) {
 141                 c2 = *--start;
 142                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
 143                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
 144                     0xd800 <= c1 && c1 <= 0xdbff) {
 145                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
 146                     start--;
 147                 } else
 148                   c1 = c2;
 149             }
 150         }
 151
 152         /*
 153          * Handle space compression if indicated.
 154          */
 155         if (pat->flags & UTBM_SPACE_COMPRESS) {
 156             check_space = 0;
 157             while (start > text &&
 158                    (_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
 159                 check_space = _utbm_isspace(c1, 1);
 160                 c2 = *--start;
 161                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
 162                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
 163                     0xd800 <= c1 && c1 <= 0xdbff) {
 164                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
 165                     start--;
 166                 } else
 167                   c1 = c2;
 168             }
 169             /*
 170              * Handle things if space compression was indicated and one or
 171              * more member characters were found.
 172              */
 173             if (check_space) {
 174                 if (cp->uc != ' ')
 175                   return 0;
 176                 cp--;
 177                 count--;
 178             }
 179         }
 180
 181         /*
 182          * Handle the normal comparison cases.
 183          */
 184         if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
 185           return 0;
 186
 187         count -= (c1 >= 0x10000) ? 2 : 1;
 188         if (count > 0) {
 189             cp--;
 190
 191             /*
 192              * Get the next preceding character.
 193              */
 194             if (start > text) {
 195                 c2 = *--start;
 196                 c1 = (start - 1 > text) ? *(start - 1) : ~0;
 197                 if (0xdc00 <= c2 && c2 <= 0xdfff &&
 198                     0xd800 <= c1 && c1 <= 0xdbff) {
 199                     c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
 200                     start--;
 201                 } else
 202                   c1 = c2;
 203             }
 204         }
 205     }
 206
 207     /*
 208      * Set the match start position.
 209      */
 210     *match_start = start - text;
 211     return 1;
 212 }
 213
 214 /*************************************************************************
 215  *
 216  * API.
 217  *
 218  *************************************************************************/
 219
 220 utbm_pattern_t
 221 utbm_create_pattern(void)
 222 {
 223     utbm_pattern_t p;
 224
 225     p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
 226     (void) memset((char *) p, 0, sizeof(_utbm_pattern_t));
 227     return p;
 228 }
 229
 230 void
 231 utbm_free_pattern(utbm_pattern_t pattern)
 232 {
 233     if (pattern == 0)
 234       return;
 235
 236     if (pattern->pat_size > 0)
 237       free((char *) pattern->pat);
 238
 239     if (pattern->skip_size > 0)
 240       free((char *) pattern->skip);
 241
 242     free((char *) pattern);
 243 }
 244
 245 void
 246 utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
 247              utbm_pattern_t p)
 248 {
 249     int have_space;
 250     unsigned long i, j, k, slen;
 251     _utbm_char_t *cp;
 252     _utbm_skip_t *sp;
 253     ucs4_t c1, c2, sentinel;
 254
 255     if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
 256       return;
 257
 258     /*
 259      * Reset the pattern buffer.
 260      */
 261     p->patlen = p->pat_used = p->skip_used = 0;
 262
 263     /*
 264      * Set the flags.
 265      */
 266     p->flags = flags;
 267
 268     /*
 269      * Initialize the extra skip flag.
 270      */
 271     p->md4 = 1;
 272
 273     /*
 274      * Allocate more storage if necessary.
 275      */
 276     if (patlen > p->pat_size) {
 277         if (p->pat_size == 0) {
 278             p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
 279             p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
 280         } else {
 281             p->pat = (_utbm_char_t *)
 282                 realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
 283             p->skip = (_utbm_skip_t *)
 284                 realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
 285         }
 286         p->pat_size = p->skip_size = patlen;
 287     }
 288
 289     /*
 290      * Preprocess the pattern to remove controls (if specified) and determine
 291      * case.
 292      */
 293     for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
 294         c1 = pat[i];
 295         c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
 296         if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
 297           c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
 298
 299         /*
 300          * Make sure the `have_space' flag is turned off if the character
 301          * is not an appropriate one.
 302          */
 303         if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
 304           have_space = 0;
 305
 306         /*
 307          * If non-spacing characters should be ignored, do it here.
 308          */
 309         if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
 310           continue;
 311
 312         /*
 313          * Check if spaces and controls need to be compressed.
 314          */
 315         if (flags & UTBM_SPACE_COMPRESS) {
 316             if (_utbm_isspace(c1, 1)) {
 317                 if (!have_space) {
 318                     /*
 319                      * Add a space and set the flag.
 320                      */
 321                     cp->uc = cp->lc = cp->tc = ' ';
 322                     cp++;
 323
 324                     /*
 325                      * Increase the real pattern length.
 326                      */
 327                     p->patlen++;
 328                     sentinel = ' ';
 329                     have_space = 1;
 330                 }
 331                 continue;
 332             }
 333
 334             /*
 335              * Ignore all control characters.
 336              */
 337             if (_utbm_iscntrl(c1))
 338               continue;
 339         }
 340
 341         /*
 342          * Add the character.
 343          */
 344         if (flags & UTBM_CASEFOLD) {
 345             cp->uc = _utbm_toupper(c1);
 346             cp->lc = _utbm_tolower(c1);
 347             cp->tc = _utbm_totitle(c1);
 348         } else
 349           cp->uc = cp->lc = cp->tc = c1;
 350
 351         /*
 352          * Set the sentinel character.
 353          */
 354         sentinel = cp->uc;
 355
 356         /*
 357          * Move to the next character.
 358          */
 359         cp++;
 360
 361         /*
 362          * Increase the real pattern length appropriately.
 363          */
 364         p->patlen += (c1 >= 0x10000) ? 2 : 1;
 365
 366         /*
 367          * Increment the loop index for UTF-16 characters.
 368          */
 369         i += (c1 >= 0x10000) ? 1 : 0;
 370
 371     }
 372
 373     /*
 374      * Set the number of characters actually used.
 375      */
 376     p->pat_used = cp - p->pat;
 377
 378     /*
 379      * Go through and construct the skip array and determine the actual length
 380      * of the pattern in UCS2 terms.
 381      */
 382     slen = p->patlen - 1;
 383     cp = p->pat;
 384     for (i = k = 0; i < p->pat_used; i++, cp++) {
 385         /*
 386          * Locate the character in the skip array.
 387          */
 388         for (sp = p->skip, j = 0;
 389              j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
 390
 391         /*
 392          * If the character is not found, set the new skip element and
 393          * increase the number of skip elements.
 394          */
 395         if (j == p->skip_used) {
 396             sp->ch = cp;
 397             p->skip_used++;
 398         }
 399
 400         /*
 401          * Set the updated skip value.  If the character is UTF-16 and is
 402          * not the last one in the pattern, add one to its skip value.
 403          */
 404         sp->skip = slen - k;
 405         if (cp->uc >= 0x10000 && k + 2 < slen)
 406           sp->skip++;
 407
 408         /*
 409          * Set the new extra skip for the sentinel character.
 410          */
 411         if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
 412             cp->uc == sentinel)
 413           p->md4 = slen - k;
 414
 415         /*
 416          * Increase the actual index.
 417          */
 418         k += (cp->uc >= 0x10000) ? 2 : 1;
 419     }
 420 }
 421
 422 int
 423 utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
 424           unsigned long *match_start, unsigned long *match_end)
 425 {
 426     unsigned long k;
 427     ucs2_t *start, *end;
 428
 429     if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
 430         textlen < pat->patlen)
 431       return 0;
 432
 433     start = text + pat->patlen;
 434     end = text + textlen;
 435
 436     /*
 437      * Adjust the start point if it points to a low surrogate.
 438      */
 439     if (0xdc00 <= *start && *start <= 0xdfff &&
 440         0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
 441       start--;
 442
 443     while (start < end) {
 444         while ((k = _utbm_skip(pat, start, end))) {
 445             start += k;
 446             if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
 447                 0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
 448               start--;
 449         }
 450
 451         if (start < end &&
 452             _utbm_match(pat, text, start, end, match_start, match_end))
 453           return 1;
 454
 455         start += pat->md4;
 456         if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
 457             0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
 458           start--;
 459     }
 460     return 0;
 461 }