git.sur5r.net Git - openldap/blob - libraries/liblunicode/ucstr.c

   1 #include "portable.h"
   2
   3 #include <ldap_pvt_uc.h>
   4
   5 #include <ac/ctype.h>
   6 #include <ac/string.h>
   7 #include <ac/stdlib.h>
   8
   9 int ucstrncmp(
  10         const ldap_unicode_t *u1,
  11         const ldap_unicode_t *u2,
  12         ber_len_t n )
  13 {
  14         for(; 0 < n; ++u1, ++u2, --n ) {
  15                 if( *u1 != *u2 ) {
  16                         return *u1 < *u2 ? -1 : +1;
  17                 }
  18                 if ( *u1 == 0 ) {
  19                         return 0;
  20                 }
  21         }
  22         return 0;
  23 }
  24
  25 int ucstrncasecmp(
  26         const ldap_unicode_t *u1,
  27         const ldap_unicode_t *u2,
  28         ber_len_t n )
  29 {
  30         for(; 0 < n; ++u1, ++u2, --n ) {
  31                 ldap_unicode_t uu1 = uctoupper( *u1 );
  32                 ldap_unicode_t uu2 = uctoupper( *u2 );
  33
  34                 if( uu1 != uu2 ) {
  35                         return uu1 < uu2 ? -1 : +1;
  36                 }
  37                 if ( uu1 == 0 ) {
  38                         return 0;
  39                 }
  40         }
  41         return 0;
  42 }
  43
  44 ldap_unicode_t * ucstrnchr(
  45         const ldap_unicode_t *u,
  46         ber_len_t n,
  47         ldap_unicode_t c )
  48 {
  49         for(; 0 < n; ++u, --n ) {
  50                 if( *u == c ) {
  51                         return (ldap_unicode_t *) u;
  52                 }
  53         }
  54
  55         return NULL;
  56 }
  57
  58 ldap_unicode_t * ucstrncasechr(
  59         const ldap_unicode_t *u,
  60         ber_len_t n,
  61         ldap_unicode_t c )
  62 {
  63         c = uctoupper( c );
  64         for(; 0 < n; ++u, --n ) {
  65                 if( uctoupper( *u ) == c ) {
  66                         return (ldap_unicode_t *) u;
  67                 }
  68         }
  69
  70         return NULL;
  71 }
  72
  73 void ucstr2upper(
  74         ldap_unicode_t *u,
  75         ber_len_t n )
  76 {
  77         for(; 0 < n; ++u, --n ) {
  78                 *u = uctoupper( *u );
  79         }
  80 }
  81
  82 char * UTF8normalize(
  83         const char *s,
  84         char casefold )
  85 {
  86         int i, j, len, clen, outpos, ucsoutlen, outsize, last;
  87         char *out;
  88         unsigned long *ucs, *p, *ucsout;
  89
  90         static unsigned char mask[] = {
  91                 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  92
  93         if ( s == NULL ) {
  94                 return NULL;
  95         }
  96
  97         len = strlen( s );
  98
  99         if ( len == 0 ) {
 100                 out = (char *) malloc( 1 );
 101                 *out = '\0';
 102                 return out;
 103         }
 104
 105         outsize = len + 7;
 106         out = (char *) malloc( outsize );
 107         if ( out == NULL ) {
 108                 return NULL;
 109         }
 110
 111         outpos = 0;
 112
 113         /* finish off everything up to character before first non-ascii */
 114         if ( LDAP_UTF8_ISASCII( s ) ) {
 115                 for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
 116                         out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
 117                 }
 118                 if ( i == len ) {
 119                         out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
 120                         out[outpos] = '\0';
 121                         return out;
 122                 }
 123         } else {
 124                 i = 0;
 125         }
 126
 127         p = ucs = (long *) malloc( len * sizeof(*ucs) );
 128         if ( ucs == NULL ) {
 129                 free(out);
 130                 return NULL;
 131         }
 132
 133         /* convert character before first non-ascii to ucs-4 */
 134         if ( i > 0 ) {
 135                 *p = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
 136                 p++;
 137         }
 138
 139         /* s[i] is now first non-ascii character */
 140         for (;;) {
 141                 /* s[i] is non-ascii */
 142                 /* convert everything up to next ascii to ucs-4 */
 143                 while ( i < len ) {
 144                         clen = LDAP_UTF8_CHARLEN( s + i );
 145                         if ( clen == 0 ) {
 146                                 free( ucs );
 147                                 free( out );
 148                                 return NULL;
 149                         }
 150                         if ( clen == 1 ) {
 151                                 /* ascii */
 152                                 break;
 153                         }
 154                         *p = s[i] & mask[clen];
 155                         i++;
 156                         for( j = 1; j < clen; j++ ) {
 157                                 if ( (s[i] & 0xc0) != 0x80 ) {
 158                                         free( ucs );
 159                                         free( out );
 160                                         return NULL;
 161                                 }
 162                                 *p <<= 6;
 163                                 *p |= s[i] & 0x3f;
 164                                 i++;
 165                         }
 166                         if ( casefold ) {
 167                                 *p = uctoupper( *p );
 168                         }
 169                         p++;
 170                 }
 171                 /* normalize ucs of length p - ucs */
 172                 uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
 173                 ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
 174                 /* convert ucs to utf-8 and store in out */
 175                 for ( j = 0; j < ucsoutlen; j++ ) {
 176                         /* allocate more space if not enough room for
 177                            6 bytes and terminator */
 178                         if ( outsize - outpos < 7 ) {
 179                                 outsize = ucsoutlen - j + outpos + 6;
 180                                 out = (char *) realloc( out, outsize );
 181                                 if ( out == NULL ) {
 182                                         free( ucs );
 183                                         return NULL;
 184                                 }
 185                         }
 186                         outpos += ldap_ucs4_to_utf8( ucsout[j], &out[outpos] );
 187                 }
 188
 189                 if ( i == len ) {
 190                         break;
 191                 }
 192
 193                 last = i;
 194
 195                 /* s[i] is ascii */
 196                 /* finish off everything up to char before next non-ascii */
 197                 for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
 198                         out[outpos++] = casefold ? TOUPPER( s[i-1] ) : s[i-1];
 199                 }
 200                 if ( i == len ) {
 201                         out[outpos++] = casefold ? TOUPPER( s[len - 1] ) : s[len - 1];
 202                         break;
 203                 }
 204
 205                 /* convert character before next non-ascii to ucs-4 */
 206                 *ucs = casefold ? TOUPPER( s[i - 1] ) : s[i - 1];
 207                 p = ucs + 1;
 208         }
 209         free( ucs );
 210         out[outpos] = '\0';
 211         return out;
 212 }
 213
 214 /* compare UTF8-strings, optionally ignore casing, string pointers must not be NULL */
 215 /* slow, should be optimized */
 216 int UTF8normcmp(
 217         const char *s1,
 218         const char *s2,
 219         char casefold )
 220 {
 221         int i, l1, l2, len, ulen, res;
 222         unsigned long *ucs, *ucsout1, *ucsout2;
 223
 224         l1 = strlen( s1 );
 225         l2 = strlen( s2 );
 226
 227         if ( ( l1 == 0 ) || ( l2 == 0 ) ) {
 228                 if ( l1 == l2 ) {
 229                         return 0;
 230                 }
 231                 return *s1 - *s2 > 0 ? 1 : -1;
 232         }
 233
 234         ucs = (long *) malloc( ( l1 > l2 ? l1 : l2 ) * sizeof(*ucs) );
 235         if ( ucs == NULL ) {
 236                 return l1 > l2 ? 1 : -1; /* what to do??? */
 237         }
 238
 239         /*
 240          * XXYYZ: we convert to ucs4 even though -llunicode
 241          * expects ucs2 in an unsigned long
 242          */
 243
 244         /* convert and normalize 1st string */
 245         for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
 246                 ucs[ulen] = ldap_utf8_to_ucs4( s1 + i );
 247                 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 248                         free( ucs );
 249                         return -1; /* what to do??? */
 250                 }
 251                 len = LDAP_UTF8_CHARLEN( s1 + i );
 252         }
 253         uccanondecomp( ucs, ulen, &ucsout1, &l1 );
 254         l1 = uccanoncomp( ucsout1, l1 );
 255
 256         /* convert and normalize 2nd string */
 257         for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
 258                 ucs[ulen] = ldap_utf8_to_ucs4( s2 + i );
 259                 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
 260                         free( ucsout1 );
 261                         free( ucs );
 262                         return 1; /* what to do??? */
 263                 }
 264                 len = LDAP_UTF8_CHARLEN( s2 + i );
 265         }
 266         uccanondecomp( ucs, ulen, &ucsout2, &l2 );
 267         l2 = uccanoncomp( ucsout2, l2 );
 268
 269         free( ucs );
 270
 271         res = casefold
 272                 ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
 273                 : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
 274         free( ucsout1 );
 275         free( ucsout2 );
 276
 277         if ( res != 0 ) {
 278                 return res;
 279         }
 280         if ( l1 == l2 ) {
 281                 return 0;
 282         }
 283         return l1 > l2 ? 1 : -1;
 284 }