1 /* phonetic.c - routines to do phonetic matching */
3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 * Copyright 1998-2004 The OpenLDAP Foundation.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted only as authorized by the OpenLDAP
12 * A copy of this license is available in the file LICENSE in the
13 * top-level directory of the distribution or, alternatively, at
14 * <http://www.OpenLDAP.org/license.html>.
16 /* Portions Copyright (c) 1995 Regents of the University of Michigan.
17 * All rights reserved.
19 * Redistribution and use in source and binary forms are permitted
20 * provided that this notice is preserved and that due credit is given
21 * to the University of Michigan at Ann Arbor. The name of the University
22 * may not be used to endorse or promote products derived from this
23 * software without specific prior written permission. This software
24 * is provided ``as is'' without express or implied warranty.
32 #include <ac/string.h>
33 #include <ac/socket.h>
38 #if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC)
39 #define SLAPD_METAPHONE
/*
 * iswordbreak(x): non-zero (1) when character x separates words, else 0.
 *
 * A character is a word break when it is any of:
 *   - outside the 7-bit ASCII range (this includes negative values of a
 *     signed char),
 *   - white space, punctuation or a decimal digit,
 *   - the terminating NUL.
 *
 * The 7-bit test is written as a mask instead of calling isascii(), which
 * is not part of ISO C and may be undeclared in strictly conforming builds.
 * The <ctype.h> classifiers get the value cast to unsigned char so that a
 * negative char is never passed to them.
 *
 * NOTE: x is evaluated more than once; do not pass an expression with
 * side effects.
 */
#define iswordbreak(x)	((((x) & ~0x7f) != 0) || \
			 isspace((unsigned char) (x)) || \
			 ispunct((unsigned char) (x)) || \
			 isdigit((unsigned char) (x)) || \
			 (x) == '\0')
53 while ( iswordbreak( *s ) ) {
71 while ( ! iswordbreak( *s ) ) {
75 while ( iswordbreak( *s ) ) {
92 for ( s = w; !iswordbreak( *s ); s++ )
102 #ifndef MAXPHONEMELEN
103 #define MAXPHONEMELEN 4
106 #if defined(SLAPD_PHONETIC)
108 /* lifted from isode-8.0 */
112 char code, adjacent, ch;
115 char phoneme[MAXPHONEMELEN + 1];
118 if ( p == NULL || *p == '\0' ) {
123 phoneme[0] = TOUPPER((unsigned char)*p);
126 for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
127 ch = TOUPPER ((unsigned char)*p);
136 code = (adjacent != '1') ? '1' : '0';
146 code = (adjacent != '2') ? '2' : '0';
150 code = (adjacent != '3') ? '3' : '0';
153 code = (adjacent != '4') ? '4' : '0';
157 code = (adjacent != '5') ? '5' : '0';
160 code = (adjacent != '6') ? '6' : '0';
169 } else if ( code != '0' ) {
170 if ( i == MAXPHONEMELEN )
172 adjacent = phoneme[i] = code;
180 return( ch_strdup( phoneme ) );
183 #elif defined(SLAPD_METAPHONE)
186 * Metaphone was originally developed by Lawrence Philips and
187 * published in the "Computer Language" magazine in 1990.
190 * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
191 * author Gary A. Parker, with changes by Bernard Tiffany of the
192 * University of Michigan, and more changes by Tim Howes of the
193 * University of Michigan.
/*
 * Character coding array.
 *
 * vsvfn[] holds one entry per upper-case letter, indexed by (letter - 'A').
 * Each entry is a bitwise OR of the classification flags below; the
 * letters that carry each flag are listed next to it.
 */
enum {
	MPH_VOWEL  = 1,		/* A E I O U   */
	MPH_SAME   = 2,		/* F J L M N R */
	MPH_VARSON = 4,		/* C G P S T   */
	MPH_FRONTV = 8,		/* E I Y       */
	MPH_NOGHF  = 16		/* B D H       */
};

static const char vsvfn[26] = {
	/* A */ MPH_VOWEL,
	/* B */ MPH_NOGHF,
	/* C */ MPH_VARSON,
	/* D */ MPH_NOGHF,
	/* E */ MPH_VOWEL | MPH_FRONTV,
	/* F */ MPH_SAME,
	/* G */ MPH_VARSON,
	/* H */ MPH_NOGHF,
	/* I */ MPH_VOWEL | MPH_FRONTV,
	/* J */ MPH_SAME,
	/* K */ 0,
	/* L */ MPH_SAME,
	/* M */ MPH_SAME,
	/* N */ MPH_SAME,
	/* O */ MPH_VOWEL,
	/* P */ MPH_VARSON,
	/* Q */ 0,
	/* R */ MPH_SAME,
	/* S */ MPH_VARSON,
	/* T */ MPH_VARSON,
	/* U */ MPH_VOWEL,
	/* V */ 0,
	/* W */ 0,
	/* X */ 0,
	/* Y */ MPH_FRONTV,
	/* Z */ 0
};

/*
 * Macros to access character coding array.
 *
 * Each macro yields 1 when x is an upper-case letter carrying the given
 * flag and 0 otherwise.  Anything outside 'A'..'Z' (including NUL) yields 0
 * without touching the table, so the lookup can never index out of bounds.
 * x is evaluated more than once; do not pass an expression with side
 * effects.
 */
#define vsvfn_flags(x)	(((x) >= 'A' && (x) <= 'Z') ? vsvfn[(x) - 'A'] : 0)

#define vowel(x)	((vsvfn_flags(x) & MPH_VOWEL) != 0)	/* AEIOU */
#define same(x)		((vsvfn_flags(x) & MPH_SAME) != 0)	/* FJLMNR */
#define varson(x)	((vsvfn_flags(x) & MPH_VARSON) != 0)	/* CGPST */
#define frontv(x)	((vsvfn_flags(x) & MPH_FRONTV) != 0)	/* EIY */
#define noghf(x)	((vsvfn_flags(x) & MPH_NOGHF) != 0)	/* BDH */
211 phonetic( char *Word )
213 char *n, *n_start, *n_end; /* pointers to string */
214 char *metaph_end; /* pointers to metaph */
215 char ntrans[40]; /* word with uppercase letters */
216 int KSflag; /* state flag for X -> KS */
217 char buf[MAXPHONEMELEN + 2];
221 * Copy Word to internal buffer, dropping non-alphabetic characters
222 * and converting to upper case
225 for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
227 if (isalpha((unsigned char)*Word))
228 *n++ = TOUPPER((unsigned char)*Word);
232 if (n == ntrans + 4) {
233 return( ch_strdup( buf ) ); /* Return if null */
235 n_end = n; /* Set n_end to end of string */
237 /* ntrans[0] will always be == 0 */
245 *n = 0; /* Pad with nulls */
246 n = ntrans + 4; /* Assign pointer to start */
248 /* Check for PN, KN, GN, AE, WR, WH, and X at start */
253 /* 'PN', 'KN', 'GN' becomes 'N' */
258 /* 'AE' becomes 'E' */
263 /* 'WR' becomes 'R', and 'WH' to 'H' */
266 else if (*(n + 1) == 'H') {
272 /* 'X' becomes 'S' */
278 * Now, loop step through string, stopping at end of string or when
279 * the computed 'metaph' is MAXPHONEMELEN characters long
282 KSflag = 0; /* state flag for KS translation */
283 for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
284 n <= n_end && Metaph < metaph_end; n++) {
289 /* Drop duplicates except for CC */
290 if (*(n - 1) == *n && *n != 'C')
292 /* Check for F J L M N R or first letter vowel */
293 if (same(*n) || (n == n_start && vowel(*n)))
302 if (n == (n_end - 1) && *(n - 1) != 'M')
308 * X if in -CIA-, -CH- else S if in
309 * -CI-, -CE-, -CY- else dropped if
310 * in -SCI-, -SCE-, -SCY- else K
312 if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
313 if (*(n + 1) == 'I' && *(n + 2) == 'A')
315 else if (frontv(*(n + 1)))
317 else if (*(n + 1) == 'H')
318 *Metaph++ = ((n == n_start && !vowel(*(n + 2)))
320 ? (char) 'K' : (char) 'X';
328 * J if in DGE or DGI or DGY else T
330 *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
331 ? (char) 'J' : (char) 'T';
336 * F if in -GH and not B--GH, D--GH,
337 * -H--GH, -H---GH else dropped if
338 * -GNED, -GN, -DGE-, -DGI-, -DGY-
339 * else J if in -GE-, -GI-, -GY- and
342 if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
343 (*(n + 1) != 'N' || ((n + 1) < n_end &&
344 (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
345 (*(n - 1) != 'D' || !frontv(*(n + 1))))
346 *Metaph++ = (frontv(*(n + 1)) &&
347 *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
348 else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
355 * H if before a vowel and not after
356 * C, G, P, S, T else dropped
358 if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
365 * dropped if after C else K
373 * F if before H, else P
375 *Metaph++ = *(n + 1) == 'H' ?
376 (char) 'F' : (char) 'P';
388 * X in -SH-, -SIO- or -SIA- else S
390 *Metaph++ = (*(n + 1) == 'H' ||
391 (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
393 ? (char) 'X' : (char) 'S';
398 * X in -TIA- or -TIO- else 0 (zero)
399 * before H else dropped if in -TCH-
402 if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
405 else if (*(n + 1) == 'H')
407 else if (*(n + 1) != 'C' || *(n + 2) != 'H')
420 * W after a vowel, else dropped
425 * Y unless followed by a vowel
438 *Metaph++ = 'K'; /* Insert K, then S */
453 *Metaph = 0; /* Null terminate */
454 return( ch_strdup( buf ) );
457 #endif /* SLAPD_METAPHONE */