From 42cc5e5333c36b1bd7ab250dac3a1a54ec3c439a Mon Sep 17 00:00:00 2001
From: Kurt Zeilenga <kurt@openldap.org>
Date: Sun, 23 Jan 2000 23:07:24 +0000
Subject: [PATCH] Fix bugs in UTF-8 code.  Apply to getdn and charray.

---
 include/ldap_pvt.h          |  20 +++--
 libraries/libldap/charray.c |  12 +--
 libraries/libldap/getdn.c   | 141 +++++++++++++++++++++++-------------
 libraries/libldap/string.c  |   5 ++
 libraries/libldap/utf-8.c   |  80 ++++++++++++--------
 5 files changed, 161 insertions(+), 97 deletions(-)

diff --git a/include/ldap_pvt.h b/include/ldap_pvt.h
index 4509a0f396..bbfff50d4f 100644
--- a/include/ldap_pvt.h
+++ b/include/ldap_pvt.h
@@ -77,8 +77,8 @@ ldap_charray_dup LDAP_P(( char **a ));
 
 LIBLDAP_F( char ** )
 ldap_str2charray LDAP_P((
-	char *str,
-	char *brkstr ));
+	const char *str,
+	const char *brkstr ));
 
 /* url.c */
 LIBLDAP_F (void) ldap_pvt_hex_unescape LDAP_P(( char *s ));
@@ -152,7 +152,9 @@ LIBLDAP_F (int) ldap_pvt_tls_start LDAP_P(( Sockbuf *sb, void *ctx_arg ));
 LIBLDAP_F (ber_len_t) ldap_utf8_bytes( const char * );
 /* returns the number of UTF-8 characters in the string */
 LIBLDAP_F (ber_len_t) ldap_utf8_chars( const char * );
-/* returns the length (in bytes) of a UTF-8 string */
+/* returns the length (in bytes) of the UTF-8 character */
+LIBLDAP_F (int) ldap_utf8_offset( const char * );
+/* returns the length (in bytes) indicated by the UTF-8 character */
 LIBLDAP_F (int) ldap_utf8_charlen( const char * );
 /* copies a UTF-8 character and returning number of bytes copied */
 LIBLDAP_F (int) ldap_utf8_copy( char *, const char *);
@@ -174,6 +176,8 @@ LIBLDAP_F (int) ldap_utf8_isspace( const char * );
 LIBLDAP_F (ber_len_t) ldap_utf8_strcspn( const char* str, const char *set);
 /* span characters in set, return bytes spanned */
 LIBLDAP_F (ber_len_t) ldap_utf8_strspn( const char* str, const char *set);
+/* return first occurrence of character in string */
+LIBLDAP_F (char *) ldap_utf8_strchr( const char* str, const char *chr);
 /* return first character of set in string */
 LIBLDAP_F (char *) ldap_utf8_strpbrk( const char* str, const char *set);
 /* reentrant tokenizer */
@@ -183,14 +187,16 @@ LIBLDAP_F (char*) ldap_utf8_strtok( char* sp, const char* sep, char **last);
 #define LDAP_UTF8_ISASCII(p) ( * (const unsigned char *) (p) < 0x100 )
 #define LDAP_UTF8_CHARLEN(p) ( LDAP_UTF8_ISASCII(p) \
 	? 1 : ldap_utf8_charlen((p)) )
+#define LDAP_UTF8_OFFSET(p) ( LDAP_UTF8_ISASCII(p) \
+	? 1 : ldap_utf8_offset((p)) )
 
-#define LDAP_UTF8_COPY(p) (	LDAP_UTF8_ISASCII(p) \
-	? (*(d) = *(s), 1) : ldap_utf8_cpy((d),(s)) )
+#define LDAP_UTF8_COPY(d,s) (	LDAP_UTF8_ISASCII(s) \
+	? (*(d) = *(s), 1) : ldap_utf8_copy((d),(s)) )
 
 #define LDAP_UTF8_NEXT(p) (	LDAP_UTF8_ISASCII(p) \
-	? &(p)[1] : ldap_utf8_next((p)) )
+	? (char *)(p)+1 : ldap_utf8_next((p)) )
 
-#define LDAP_UTF8_INCR(p) ( (p) = LDAP_UTF8_NEXT(p) )
+#define LDAP_UTF8_INCR(p) ((p) = LDAP_UTF8_NEXT(p))
 
 /* For symmetry */
 #define LDAP_UTF8_PREV(p) (ldap_utf8_prev((p)))
diff --git a/libraries/libldap/charray.c b/libraries/libldap/charray.c
index 6be3152427..ff6dd8351c 100644
--- a/libraries/libldap/charray.c
+++ b/libraries/libldap/charray.c
@@ -165,22 +165,22 @@ ldap_charray_dup( char **a )
 }
 
 char **
-ldap_str2charray( char *str, char *brkstr )
+ldap_str2charray( const char *str_in, const char *brkstr )
 {
 	char	**res;
-	char	*s;
+	char	*str, *s;
 	char	*lasts;
 	int	i;
 
 	/* protect the input string from strtok */
-	str = LDAP_STRDUP( str );
+	str = LDAP_STRDUP( str_in );
 	if( str == NULL ) {
 		return NULL;
 	}
 
 	i = 1;
 	for ( s = str; *s; s++ ) {
-		if ( strchr( brkstr, *s ) != NULL ) {
+		if ( ldap_utf8_strchr( brkstr, s ) != NULL ) {
 			i++;
 		}
 	}
@@ -194,9 +194,9 @@ ldap_str2charray( char *str, char *brkstr )
 
 	i = 0;
 
-	for ( s = ldap_pvt_strtok( str, brkstr, &lasts );
+	for ( s = ldap_utf8_strtok( str, brkstr, &lasts );
 		s != NULL;
-		s = ldap_pvt_strtok( NULL, brkstr, &lasts ) )
+		s = ldap_utf8_strtok( NULL, brkstr, &lasts ) )
 	{
 		res[i] = LDAP_STRDUP( s );
 
diff --git a/libraries/libldap/getdn.c b/libraries/libldap/getdn.c
index f03be3b455..01852c6ef8 100644
--- a/libraries/libldap/getdn.c
+++ b/libraries/libldap/getdn.c
@@ -23,11 +23,11 @@
 
 #include "ldap-int.h"
 
-#define DN_TYPE_LDAP_RDN	0
-#define DN_TYPE_LDAP_DN		1
-#define DN_TYPE_DCE_DN		2
+#define NAME_TYPE_LDAP_RDN	0
+#define NAME_TYPE_LDAP_DN	1
+#define NAME_TYPE_DCE_DN	2
 
-static char **explode_name( const char *name, int notypes, int is_dn );
+static char **explode_name( const char *name, int notypes, int is_type );
 
 char *
 ldap_get_dn( LDAP *ld, LDAPMessage *entry )
@@ -64,27 +64,28 @@ ldap_dn2ufn( LDAP_CONST char *dn )
 	}
 
 	if ( ldap_is_dns_dn( dn ) ||
-		( p = strchr( dn, '=' ) ) == NULL )
+		( p = ldap_utf8_strpbrk( dn, "=" ) ) == NULL )
 	{
 		return( LDAP_STRDUP( dn ) );
 	}
 
-
 	ufn = LDAP_STRDUP( ++p );
 
+	if( ufn == NULL ) return NULL;
+
 #define INQUOTE		1
 #define OUTQUOTE	2
 	state = OUTQUOTE;
-	for ( p = ufn, r = ufn; *p; p++ ) {
+	for ( p = ufn, r = ufn; *p; LDAP_UTF8_INCR(p) ) {
 		switch ( *p ) {
 		case '\\':
-			if ( *++p == '\0' )
-				p--;
-			else {
+			if ( p[1] != '\0' ) {
 				*r++ = '\\';
-				*r++ = *p;
+				++p;	/* advance first: LDAP_UTF8_COPY evaluates its source twice */
+				r += LDAP_UTF8_COPY(r,p);	/* copy escaped char, step r past it */
 			}
 			break;
+
 		case '"':
 			if ( state == INQUOTE )
 				state = OUTQUOTE;
@@ -92,6 +93,7 @@ ldap_dn2ufn( LDAP_CONST char *dn )
 				state = INQUOTE;
 			*r++ = *p;
 			break;
+
 		case ';':
 		case ',':
 			if ( state == OUTQUOTE )
@@ -99,17 +101,22 @@ ldap_dn2ufn( LDAP_CONST char *dn )
 			else
 				*r++ = *p;
 			break;
+
 		case '=':
-			if ( state == INQUOTE )
+			if ( state == INQUOTE ) {
 				*r++ = *p;
-			else {
+			} else {
 				char	*rsave = r;
 
-				*r-- = '\0';
-				while ( !isspace( (unsigned char) *r )
+				*r = '\0';
+				LDAP_UTF8_DECR( r );
+
+				while ( !ldap_utf8_isspace( r )
 					&& *r != ';' && *r != ',' && r > ufn )
-					r--;
-				r++;
+				{
+					LDAP_UTF8_DECR( r );
+				}
+				LDAP_UTF8_INCR( r );
 
 				if ( strcasecmp( r, "c" )
 				    && strcasecmp( r, "o" )
@@ -122,8 +129,10 @@ ldap_dn2ufn( LDAP_CONST char *dn )
 				}
 			}
 			break;
+
 		default:
-			*r++ = *p;
+			LDAP_UTF8_COPY(r, p);
+			LDAP_UTF8_INCR(r);
 			break;
 		}
 	}
@@ -184,14 +193,14 @@ ldap_explode_dn( LDAP_CONST char *dn, int notypes )
 	if ( ldap_is_dns_dn( dn ) ) {
 		return( ldap_explode_dns( dn ) );
 	}
-	return explode_name( dn, notypes, DN_TYPE_LDAP_DN );
+	return explode_name( dn, notypes, NAME_TYPE_LDAP_DN );
 }
 
 char **
 ldap_explode_rdn( LDAP_CONST char *rdn, int notypes )
 {
 	Debug( LDAP_DEBUG_TRACE, "ldap_explode_rdn\n", 0, 0, 0 );
-	return explode_name( rdn, notypes, DN_TYPE_LDAP_RDN );
+	return explode_name( rdn, notypes, NAME_TYPE_LDAP_RDN );
 }
 
 char *
@@ -202,7 +211,7 @@ ldap_dn2dcedn( LDAP_CONST char *dn )
 
 	Debug( LDAP_DEBUG_TRACE, "ldap_dn2dcedn\n", 0, 0, 0 );
 
-	rdns = explode_name( dn, 0, DN_TYPE_LDAP_DN );
+	rdns = explode_name( dn, 0, NAME_TYPE_LDAP_DN );
 	if ( rdns == NULL ) {
 		return NULL;
 	}
@@ -240,7 +249,7 @@ ldap_dcedn2dn( LDAP_CONST char *dce )
 
 	Debug( LDAP_DEBUG_TRACE, "ldap_dcedn2dn\n", 0, 0, 0 );
 
-	rdns = explode_name( dce, 0, DN_TYPE_DCE_DN );
+	rdns = explode_name( dce, 0, NAME_TYPE_DCE_DN );
 	if ( rdns == NULL ) {
 		return NULL;
 	}
@@ -279,22 +288,35 @@ ldap_dcedn2dn( LDAP_CONST char *dce )
 }
 
 static char **
-explode_name( const char *name, int notypes, int is_dn )
+explode_name( const char *name, int notypes, int is_type )
 {
-	const char *p, *q;
+	const char *p, *q, *rdn;
 	char **parts = NULL;
-	int	state, count = 0, endquote, len;
+	int	offset, state, have_equals, count = 0, endquote, len;
+
+	/* safe guard */
+	if(name == NULL) name = "";
+
+	/* skip leading whitespace */
+	while( ldap_utf8_isspace( name )) {
+		LDAP_UTF8_INCR( name );
+	}
 
-	p = name-1;
+	p = rdn = name;
+	offset = 0;
 	state = OUTQUOTE;
+	have_equals=0;
 
 	do {
+		/* step forward */
+		p += offset;
+		offset = 1;
 
-		++p;
 		switch ( *p ) {
 		case '\\':
-			if ( *++p == '\0' )
-				p--;
+			if ( p[1] != '\0' ) {
+				offset = 1 + LDAP_UTF8_OFFSET(p+1);	/* skip '\\' plus escaped char; macro evaluates its arg twice, so no ++p */
+			}
 			break;
 		case '"':
 			if ( state == INQUOTE )
@@ -302,23 +324,28 @@ explode_name( const char *name, int notypes, int is_dn )
 			else
 				state = INQUOTE;
 			break;
+		case '=':
+			if( state == OUTQUOTE ) have_equals++;	/* compare; assigning here would reset the quote state */
+			break;
 		case '+':
-			if (is_dn == DN_TYPE_LDAP_RDN)
+			if (is_type == NAME_TYPE_LDAP_RDN)
 				goto end_part;
 			break;
 		case '/':
-			if (is_dn == DN_TYPE_DCE_DN)
+			if (is_type == NAME_TYPE_DCE_DN)
 				goto end_part;
 			break;
 		case ';':
 		case ',':
-			if (is_dn == DN_TYPE_LDAP_DN)
+			if (is_type == NAME_TYPE_LDAP_DN)
 				goto end_part;
 			break;
 		case '\0':
 		end_part:
 			if ( state == OUTQUOTE ) {
 				++count;
+				have_equals=0;
+
 				if ( parts == NULL ) {
 					if (( parts = (char **)LDAP_MALLOC( 8
 						 * sizeof( char *))) == NULL )
@@ -329,31 +356,45 @@ explode_name( const char *name, int notypes, int is_dn )
 						== NULL )
 						return( NULL );
 				}
+
 				parts[ count ] = NULL;
 				endquote = 0;
+
 				if ( notypes ) {
-					for ( q = name;
-					    q < p && *q != '='; ++q ) {
-						;
+					for ( q = rdn; q < p && *q != '='; ++q ) {
+						/* EMPTY */;
 					}
+
 					if ( q < p ) {
-						name = ++q;
+						rdn = ++q;
 					}
-					if ( *name == '"' ) {
-						++name;
+
+					if ( *rdn == '"' ) {
+						++rdn;
 					}
 					
-					if ( *(p-1) == '"' ) {
+					if ( p[-1] == '"' ) {
 						endquote = 1;
 						--p;
 					}
 				}
 
-				len = p - name;
+				len = p - rdn;
+
 				if (( parts[ count-1 ] = (char *)LDAP_CALLOC( 1,
-				    len + 1 )) != NULL ) {
-				    	SAFEMEMCPY( parts[ count-1 ], name,
-					    len );
+				    len + 1 )) != NULL )
+				{
+				   	SAFEMEMCPY( parts[ count-1 ], rdn, len );
+
+					if( !endquote ) {
+						/* skip trailing spaces */
+						while( len > 0 && ldap_utf8_isspace(
+							&parts[count-1][len-1] ) )
+						{
+							--len;
+						}
+					}
+
 					parts[ count-1 ][ len ] = '\0';
 				}
 
@@ -365,11 +406,10 @@ explode_name( const char *name, int notypes, int is_dn )
 				if ( endquote == 1 )
 					p++;
 
-				name = *p ? p + 1 : p;
-				while ( isascii( *name ) && isspace( *name ) )
-					++name;
-			}
-			break;
+				rdn = *p ? &p[1] : p;
+				while ( ldap_utf8_isspace( rdn ) )
+					++rdn;
+			} break;
 		}
 	} while ( *p );
 
@@ -380,9 +420,6 @@ explode_name( const char *name, int notypes, int is_dn )
 int
 ldap_is_dns_dn( LDAP_CONST char *dn )
 {
-	return( dn[ 0 ] != '\0'
-		&& strchr( dn, '=' ) == NULL
-		&& strchr( dn, ',' ) == NULL
-		&& strchr( dn, ';' ) == NULL );
+	return dn[ 0 ] != '\0' && ldap_utf8_strpbrk( dn, "=,;" ) == NULL;
 }
 
diff --git a/libraries/libldap/string.c b/libraries/libldap/string.c
index cd9994ad06..def6fce9fa 100644
--- a/libraries/libldap/string.c
+++ b/libraries/libldap/string.c
@@ -4,6 +4,11 @@
  * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
  */
 
+/*
+ * Locale-specific 1-byte character versions
+ * See utf-8.c for UTF-8 versions
+ */
+
 #include "portable.h"
 
 #include <ac/stdlib.h>
diff --git a/libraries/libldap/utf-8.c b/libraries/libldap/utf-8.c
index eadf1de118..2c0cf19082 100644
--- a/libraries/libldap/utf-8.c
+++ b/libraries/libldap/utf-8.c
@@ -60,13 +60,19 @@ ber_len_t ldap_utf8_chars( const char * p )
 	/* could be optimized and could check for invalid sequences */
 	ber_len_t chars=0;
 
-	for( ; *p ; p=LDAP_UTF8_NEXT(p) ) {
+	for( ; *p ; LDAP_UTF8_INCR(p) ) {
 		chars++;
 	};
 
 	return chars;
 }
 
+/* return the offset, in bytes, from p to the start of the next character (LDAP_UTF8_NEXT(p) - p) */
+int ldap_utf8_offset( const char * p )
+{
+	return LDAP_UTF8_NEXT(p) - p;
+}
+
 /*
  * Returns length indicated by first byte.
  *
@@ -111,7 +117,7 @@ ber_int_t ldap_utf8_to_ucs4( const char * p )
     ber_int_t ch;
 	int len, i;
 	static unsigned char mask[] = {
-		0, 0x7f, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
+		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 
 	len = LDAP_UTF8_CHARLEN(p);
 
@@ -146,33 +152,33 @@ int ldap_ucs4_to_utf8( ber_int_t c, char *buf )
 
 	} else if( c < 0x800 ) {
 		p[len++] = 0xc0 | ( c >> 6 );
-		p[len++] = 0x80 | ( c & 0x3F );
+		p[len++] = 0x80 | ( c & 0x3f );
 
 	} else if( c < 0x10000 ) {
 		p[len++] = 0xe0 | ( c >> 12 );
-		p[len++] = 0x80 | ( (c >> 6) & 0x3F );
-		p[len++] = 0x80 | ( c & 0x3F );
+		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
+		p[len++] = 0x80 | ( c & 0x3f );
 
 	} else if( c < 0x200000 ) {
 		p[len++] = 0xf0 | ( c >> 18 );
-		p[len++] = 0x80 | ( (c >> 12) & 0x3F );
-		p[len++] = 0x80 | ( (c >> 6) & 0x3F );
-		p[len++] = 0x80 | ( c & 0x3F );
+		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
+		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
+		p[len++] = 0x80 | ( c & 0x3f );
 
 	} else if( c < 0x400000 ) {
 		p[len++] = 0xf8 | ( c >> 24 );
-		p[len++] = 0x80 | ( (c >> 18) & 0x3F );
-		p[len++] = 0x80 | ( (c >> 12) & 0x3F );
-		p[len++] = 0x80 | ( (c >> 6) & 0x3F );
-		p[len++] = 0x80 | ( c & 0x3F );
+		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
+		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
+		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
+		p[len++] = 0x80 | ( c & 0x3f );
 
 	} else /* if( c < 0x80000000 ) */ {
 		p[len++] = 0xfc | ( c >> 30 );
-		p[len++] = 0x80 | ( (c >> 24) & 0x3F );
-		p[len++] = 0x80 | ( (c >> 18) & 0x3F );
-		p[len++] = 0x80 | ( (c >> 12) & 0x3F );
-		p[len++] = 0x80 | ( (c >> 6) & 0x3F );
-		p[len++] = 0x80 | ( c & 0x3F );
+		p[len++] = 0x80 | ( (c >> 24) & 0x3f );
+		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
+		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
+		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
+		p[len++] = 0x80 | ( c & 0x3f );
 	}
 
 	buf[len] = '\0';
@@ -198,7 +204,7 @@ char* ldap_utf8_next( const char * p )
 	}
 
 	for( i=1; i<6; i++ ) {
-		if ( u[i] & 0xC0 != 0x80 ) {
+		if ( ( u[i] & 0xc0 ) != 0x80 ) {	/* parenthesized: != binds tighter than & */
 			return (char *) &p[i];
 		}
 	}
@@ -221,7 +227,7 @@ char* ldap_utf8_prev( const char * p )
 	const unsigned char *u = p;
 
 	for( i=-1; i>-6 ; i-- ) {
-		if ( u[i] & 0xC0 != 0x80 ) {
+		if ( ( u[i] & 0xc0 ) != 0x80 ) {	/* parenthesized: != binds tighter than & */
 			return (char *) &p[i];
 		}
 	}
@@ -251,7 +257,7 @@ int ldap_utf8_copy( char* dst, const char *src )
 	}
 
 	for( i=1; i<6; i++ ) {
-		if ( u[i] & 0xC0 != 0x80 ) {
+		if ( ( u[i] & 0xc0 ) != 0x80 ) {	/* parenthesized: != binds tighter than & */
 			return i; 
 		}
 		dst[i] = src[i];
@@ -340,7 +346,7 @@ int ldap_utf8_islower( const char * p )
 {
 	unsigned c = * (const unsigned char *) p;
 
-	if(!UTF8_ISASCII(c)) return 0;
+	if(!ISASCII(c)) return 0;
 
 	return ( c >= 'a' && c <= 'z' );
 }
@@ -360,15 +366,26 @@ int ldap_utf8_isupper( const char * p )
  * UTF-8 string routines
  */
 
+/* like strchr(), but chr points to the character to find; characters are compared via ldap_utf8_to_ucs4(); returns NULL if no match */
+char * (ldap_utf8_strchr)( const char *str, const char *chr )
+{
+	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
+		if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( chr ) ) {
+			return (char *) str;
+		} 
+	}
+
+	return NULL;
+}
+
 /* like strcspn() but returns number of bytes, not characters */
 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 {
 	const char *cstr;
+	const char *cset;
 
-	for( cstr = str; *cstr != '\0'; cstr = LDAP_UTF8_NEXT(cstr) ) {
-		const char *cset;
-
-		for( cset = set; ; cset = LDAP_UTF8_NEXT(cset) ) {
+	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
+		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
 			if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
 				return cstr - str;
 			} 
@@ -382,9 +399,9 @@ ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 {
 	const char *cstr;
+	const char *cset;
 
 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
-		const char *cset;
 
 		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
 			if( *cset == '\0' ) {
@@ -404,14 +421,13 @@ ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
 {
 	int len;
-	const char *cstr;
 
-	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
+	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
 		const char *cset;
 
-		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
-			if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
-				return (char *) cstr;
+		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
+			if( ldap_utf8_to_ucs4( str ) == ldap_utf8_to_ucs4( cset ) ) {
+				return (char *) str;
 			} 
 		}
 	}
@@ -436,7 +452,7 @@ char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
 		return NULL;
 	}
 
-	end = &begin[ ldap_utf8_strcpn( begin, sep ) ];
+	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
 
 	if( *end != '\0' ) {
 		char *next = LDAP_UTF8_NEXT( end );
-- 
2.39.5