]> git.sur5r.net Git - openldap/blob - servers/slapd/phonetic.c
Add ldap_*2name() in <include,libldap>/schema, use them in slapd/schema
[openldap] / servers / slapd / phonetic.c
1 /* phonetic.c - routines to do phonetic matching */
2 /*
3  * Copyright 1998-1999 The OpenLDAP Foundation, All Rights Reserved.
4  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
5  */
6
7 #include "portable.h"
8
9 #include <stdio.h>
10
11 #include <ac/ctype.h>
12 #include <ac/string.h>
13 #include <ac/socket.h>
14 #include <ac/time.h>
15
16 #include "slap.h"
17
/*
 * Algorithm selection: when neither METAPHONE nor SLAPD_PHONETIC has been
 * defined by the build, fall back to METAPHONE.
 */
#if !( defined(METAPHONE) || defined(SLAPD_PHONETIC) )
#define METAPHONE
#endif
21
/*
 * iswordbreak(x) - nonzero when character x cannot be part of a word:
 * any non-ASCII value, white space, punctuation, a digit, or the
 * terminating NUL.
 *
 * The non-ASCII test is spelled out as a bit test instead of calling
 * isascii(), which is not part of ISO C; the result is the same for
 * every int value.
 *
 * NOTE: x is evaluated several times, so it must not have side effects.
 */
#define iswordbreak(x)  ((((x) & ~0x7f) != 0) || \
                         isspace((unsigned char) (x)) || \
                         ispunct((unsigned char) (x)) || \
                         isdigit((unsigned char) (x)) || (x) == '\0')
25
/*
 * first_word - locate the first word in s.
 *
 * Skips leading word-break characters.  Returns a pointer into s at the
 * first character that is not a word break, or NULL when s is NULL or
 * the end of the string is reached before any such character.
 */
char *
first_word( char *s )
{
        char    *p;

        if ( s == NULL ) {
                return( NULL );
        }

        /* NUL counts as a word break, so the scan always stops at the end */
        for ( p = s; iswordbreak( *p ); p++ ) {
                if ( *p == '\0' ) {
                        return( NULL );
                }
        }

        return( p );
}
43
/*
 * next_word - advance from the current word to the following one.
 *
 * s is expected to point into a word.  Returns a pointer to the first
 * character of the next word, or NULL when s is NULL or no further word
 * exists before the end of the string.
 */
char *
next_word( char *s )
{
        char    *p = s;

        if ( p == NULL ) {
                return( NULL );
        }

        /* step over the remainder of the current word */
        for ( ; !iswordbreak( *p ); p++ ) {
                ;
        }

        /* step over the separators; hitting NUL means there is no next word */
        for ( ; iswordbreak( *p ); p++ ) {
                if ( *p == '\0' ) {
                        return( NULL );
                }
        }

        return( p );
}
65
/*
 * word_dup - return a ch_strdup()'d copy of the word starting at w.
 *
 * The word extends up to (not including) the first word-break character.
 * NOTE: the byte following the word is overwritten with NUL while the
 * copy is made and restored afterwards, so w must point to writable
 * storage.
 */
char *
word_dup( char *w )
{
        char    *end = w;
        char    *copy;
        char    saved;

        /* find the end of the word */
        while ( !iswordbreak( *end ) ) {
                end++;
        }

        /* terminate in place, duplicate, then put the original byte back */
        saved = *end;
        *end = '\0';
        copy = ch_strdup( w );
        *end = saved;

        return( copy );
}
81
/*
 * Upper bound on the number of characters in a code produced by
 * phonetic(); a value supplied by the build takes precedence.
 */
#if !defined(MAXPHONEMELEN)
#define MAXPHONEMELEN   4
#endif
85
86 #if defined(SLAPD_PHONETIC)
87
88 /* lifted from isode-8.0 */
89 char *
90 phonetic( char *s )
91 {
92         char    code, adjacent, ch;
93         char    *p;
94         int     i;
95         char    phoneme[MAXPHONEMELEN + 1];
96
97         p = s;
98         if (  p == NULL || *p == '\0' ) {
99                 return( NULL );
100         }
101
102         adjacent = '0';
103         phoneme[0] = TOUPPER((unsigned char)*p);
104
105         phoneme[1]  = '\0';
106         for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
107                 ch = TOUPPER ((unsigned char)*p);
108
109                 code = '0';
110
111                 switch (ch) {
112                 case 'B':
113                 case 'F':
114                 case 'P':
115                 case 'V':
116                         code = (adjacent != '1') ? '1' : '0';
117                         break;
118                 case 'S':
119                 case 'C':
120                 case 'G':
121                 case 'J':
122                 case 'K':
123                 case 'Q':
124                 case 'X':
125                 case 'Z':
126                         code = (adjacent != '2') ? '2' : '0';
127                         break;
128                 case 'D':
129                 case 'T':
130                         code = (adjacent != '3') ? '3' : '0';
131                         break;
132                 case 'L':
133                         code = (adjacent != '4') ? '4' : '0';
134                         break;
135                 case 'M':
136                 case 'N':
137                         code = (adjacent != '5') ? '5' : '0';
138                         break;
139                 case 'R':
140                         code = (adjacent != '6') ? '6' : '0';
141                         break;
142                 default:
143                         adjacent = '0';
144                 }
145
146                 if ( i == 0 ) {
147                         adjacent = code;
148                         i++;
149                 } else if ( code != '0' ) {
150                         if ( i == MAXPHONEMELEN )
151                                 break;
152                         adjacent = phoneme[i] = code;
153                         i++;
154                 }
155         }
156
157         if ( i > 0 )
158                 phoneme[i] = '\0';
159
160         return( ch_strdup( phoneme ) );
161 }
162
163 #else
164 #if defined(METAPHONE)
165
166 /*
167  * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
168  * author Gary A. Parker, with changes by Bernard Tiffany of the
169  * University of Michigan, and more changes by Tim Howes of the
170  * University of Michigan.
171  */
172
/*
 * Character coding array: one bit mask per upper-case letter 'A'..'Z'.
 *    1  vowel           (AEIOU)
 *    2  unchanged       (FJLMNR)
 *    4  affects H       (CGPST)
 *    8  front vowel     (EIY)
 *   16  blocks GH -> F  (BDH)
 */
static const char  vsvfn[26] = {
           1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
        /* A   B  C   D  E  F  G   H  I  J  K  L  M  */
           2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
        /* N  O  P  Q  R  S  T  U  V  W  X  Y  Z  */

/*
 * Macros to access character coding array.  Each yields 1 when x is an
 * upper-case letter carrying the given bit and 0 otherwise.  The range
 * check keeps any other value (NUL padding included) from indexing
 * outside vsvfn.  x is evaluated more than once.
 */
#define metaph_class(x, bit) \
        ((x) >= 'A' && (x) <= 'Z' && (vsvfn[(x) - 'A'] & (bit)) != 0)

#define vowel(x)        metaph_class((x), 1)    /* AEIOU */
#define same(x)         metaph_class((x), 2)    /* FJLMNR */
#define varson(x)       metaph_class((x), 4)    /* CGPST */
#define frontv(x)       metaph_class((x), 8)    /* EIY */
#define noghf(x)        metaph_class((x), 16)   /* BDH */
186
187 char *
188 phonetic( char *Word )
189 {
190         char           *n, *n_start, *n_end;    /* pointers to string */
191         char           *metaph_end;     /* pointers to metaph */
192         char            ntrans[40];     /* word with uppercase letters */
193         int             KSflag; /* state flag for X -> KS */
194         char            buf[MAXPHONEMELEN + 2];
195         char            *Metaph;
196
197         /*
198          * Copy Word to internal buffer, dropping non-alphabetic characters
199          * and converting to upper case
200          */
201
202         for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
203             n < n_end; Word++) {
204                 if (isalpha((unsigned char)*Word))
205                         *n++ = TOUPPER((unsigned char)*Word);
206         }
207         Metaph = buf;
208         *Metaph = '\0';
209         if (n == ntrans + 4) {
210                 return( ch_strdup( buf ) );             /* Return if null */
211         }
212         n_end = n;              /* Set n_end to end of string */
213
214         /* ntrans[0] will always be == 0 */
215         ntrans[0] = '\0';
216         ntrans[1] = '\0';
217         ntrans[2] = '\0';
218         ntrans[3] = '\0';
219         *n++ = 0;
220         *n++ = 0;
221         *n++ = 0;
222         *n = 0;                 /* Pad with nulls */
223         n = ntrans + 4;         /* Assign pointer to start */
224
225         /* Check for PN, KN, GN, AE, WR, WH, and X at start */
226         switch (*n) {
227         case 'P':
228         case 'K':
229         case 'G':
230                 /* 'PN', 'KN', 'GN' becomes 'N' */
231                 if (*(n + 1) == 'N')
232                         *n++ = 0;
233                 break;
234         case 'A':
235                 /* 'AE' becomes 'E' */
236                 if (*(n + 1) == 'E')
237                         *n++ = 0;
238                 break;
239         case 'W':
240                 /* 'WR' becomes 'R', and 'WH' to 'H' */
241                 if (*(n + 1) == 'R')
242                         *n++ = 0;
243                 else if (*(n + 1) == 'H') {
244                         *(n + 1) = *n;
245                         *n++ = 0;
246                 }
247                 break;
248         case 'X':
249                 /* 'X' becomes 'S' */
250                 *n = 'S';
251                 break;
252         }
253
254         /*
255          * Now, loop step through string, stopping at end of string or when
256          * the computed 'metaph' is MAXPHONEMELEN characters long
257          */
258
259         KSflag = 0;             /* state flag for KS translation */
260         for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
261              n <= n_end && Metaph < metaph_end; n++) {
262                 if (KSflag) {
263                         KSflag = 0;
264                         *Metaph++ = 'S';
265                 } else {
266                         /* Drop duplicates except for CC */
267                         if (*(n - 1) == *n && *n != 'C')
268                                 continue;
269                         /* Check for F J L M N R or first letter vowel */
270                         if (same(*n) || (n == n_start && vowel(*n)))
271                                 *Metaph++ = *n;
272                         else
273                                 switch (*n) {
274                                 case 'B':
275
276                                         /*
277                                          * B unless in -MB
278                                          */
279                                         if (n == (n_end - 1) && *(n - 1) != 'M')
280                                                 *Metaph++ = *n;
281                                         break;
282                                 case 'C':
283
284                                         /*
285                                          * X if in -CIA-, -CH- else S if in
286                                          * -CI-, -CE-, -CY- else dropped if
287                                          * in -SCI-, -SCE-, -SCY- else K
288                                          */
289                                         if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
290                                                 if (*(n + 1) == 'I' && *(n + 2) == 'A')
291                                                         *Metaph++ = 'X';
292                                                 else if (frontv(*(n + 1)))
293                                                         *Metaph++ = 'S';
294                                                 else if (*(n + 1) == 'H')
295                                                         *Metaph++ = ((n == n_start && !vowel(*(n + 2)))
296                                                          || *(n - 1) == 'S')
297                                                             ? (char) 'K' : (char) 'X';
298                                                 else
299                                                         *Metaph++ = 'K';
300                                         }
301                                         break;
302                                 case 'D':
303
304                                         /*
305                                          * J if in DGE or DGI or DGY else T
306                                          */
307                                         *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
308                                             ? (char) 'J' : (char) 'T';
309                                         break;
310                                 case 'G':
311
312                                         /*
313                                          * F if in -GH and not B--GH, D--GH,
314                                          * -H--GH, -H---GH else dropped if
315                                          * -GNED, -GN, -DGE-, -DGI-, -DGY-
316                                          * else J if in -GE-, -GI-, -GY- and
317                                          * not GG else K
318                                          */
319                                         if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
320                                             (*(n + 1) != 'N' || ((n + 1) < n_end &&
321                                                                  (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
322                                             (*(n - 1) != 'D' || !frontv(*(n + 1))))
323                                                 *Metaph++ = (frontv(*(n + 1)) &&
324                                                              *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
325                                         else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
326                                                  *(n - 4) != 'H')
327                                                 *Metaph++ = 'F';
328                                         break;
329                                 case 'H':
330
331                                         /*
332                                          * H if before a vowel and not after
333                                          * C, G, P, S, T else dropped
334                                          */
335                                         if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
336                                                            vowel(*(n + 1))))
337                                                 *Metaph++ = 'H';
338                                         break;
339                                 case 'K':
340
341                                         /*
342                                          * dropped if after C else K
343                                          */
344                                         if (*(n - 1) != 'C')
345                                                 *Metaph++ = 'K';
346                                         break;
347                                 case 'P':
348
349                                         /*
350                                          * F if before H, else P
351                                          */
352                                         *Metaph++ = *(n + 1) == 'H' ?
353                                             (char) 'F' : (char) 'P';
354                                         break;
355                                 case 'Q':
356
357                                         /*
358                                          * K
359                                          */
360                                         *Metaph++ = 'K';
361                                         break;
362                                 case 'S':
363
364                                         /*
365                                          * X in -SH-, -SIO- or -SIA- else S
366                                          */
367                                         *Metaph++ = (*(n + 1) == 'H' ||
368                                                      (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
369                                                           *(n + 2) == 'A')))
370                                             ? (char) 'X' : (char) 'S';
371                                         break;
372                                 case 'T':
373
374                                         /*
375                                          * X in -TIA- or -TIO- else 0 (zero)
376                                          * before H else dropped if in -TCH-
377                                          * else T
378                                          */
379                                         if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
380                                                            *(n + 2) == 'A'))
381                                                 *Metaph++ = 'X';
382                                         else if (*(n + 1) == 'H')
383                                                 *Metaph++ = '0';
384                                         else if (*(n + 1) != 'C' || *(n + 2) != 'H')
385                                                 *Metaph++ = 'T';
386                                         break;
387                                 case 'V':
388
389                                         /*
390                                          * F
391                                          */
392                                         *Metaph++ = 'F';
393                                         break;
394                                 case 'W':
395
396                                         /*
397                                          * W after a vowel, else dropped
398                                          */
399                                 case 'Y':
400
401                                         /*
402                                          * Y unless followed by a vowel
403                                          */
404                                         if (vowel(*(n + 1)))
405                                                 *Metaph++ = *n;
406                                         break;
407                                 case 'X':
408
409                                         /*
410                                          * KS
411                                          */
412                                         if (n == n_start)
413                                                 *Metaph++ = 'S';
414                                         else {
415                                                 *Metaph++ = 'K';        /* Insert K, then S */
416                                                 KSflag = 1;
417                                         }
418                                         break;
419                                 case 'Z':
420
421                                         /*
422                                          * S
423                                          */
424                                         *Metaph++ = 'S';
425                                         break;
426                                 }
427                 }
428         }
429
430         *Metaph = 0;            /* Null terminate */
431         return( ch_strdup( buf ) );
432 }
433
434 #endif /* metaphone */
435 #endif /* SLAPD_PHONETIC */