2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4 * Copyright 2002-2012 The OpenLDAP Foundation.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
11 * A copy of this license is available in the file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
16 * This work was initially developed by Howard Chu for inclusion in
21 * Basic T.61 <-> UTF-8 conversion
23 * These routines will perform a lossless translation from T.61 to UTF-8
24 * and a lossy translation from UTF-8 to T.61.
31 #include <ac/stdlib.h>
33 #include <ac/socket.h>
34 #include <ac/string.h>
38 #include "ldap_utf8.h"
40 #include "ldap_defaults.h"
43 * T.61 is somewhat braindead; even in the 7-bit space it is not
44 * completely equivalent to 7-bit US-ASCII. Our definition of the
45 * character set comes from RFC 1345 with a slightly more readable
46 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
48 * Even though '#' and '$' are present in the 7-bit US-ASCII space,
49 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
60 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
61 * accents of some form or another. There are predefined combinations
62 * for certain characters, but they can also be used arbitrarily. The
63 * table at dkuug.dk maps these accents to the E000 "private use" range
64 * of the Unicode space, but I believe they more properly belong in the
65 * 0300 range (non-spacing accents). The transformation is complicated
66 * slightly because Unicode wants the non-spacing character to follow
67 * the base character, while T.61 has the non-spacing character leading.
68 * Also, T.61 specifically recognizes certain combined pairs as "characters"
69 * but doesn't specify how to treat unrecognized pairs. This code will
70 * always attempt to combine pairs when a known Unicode composite exists.
/* T.61 -> Unicode map: 256 entries, looked up as t61_tab[byte].
 * '#' and '$' are produced by T.61 0xa6 and 0xa4, so slots 0x23 and 0x24
 * hold 0. Bytes 0xc1-0xcf map to combining marks in the U+0300 block and
 * 0xe0 maps to U+2126 (Ohm).
 * NOTE(review): apart from slot 0, a 0 entry appears to mean "no T.61
 * character at this code" -- confirm against the validity check.
 */
73 static const wchar_t t61_tab[] = {
74 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
75 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
76 0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
77 0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
78 0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
79 0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
80 0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
81 0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
82 0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
83 0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
84 0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
85 0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
86 0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
87 0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
88 0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
89 0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
90 0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
91 0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
92 0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
93 0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
94 0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
95 0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
96 0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
97 0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
98 0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
99 0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
100 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
101 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
102 0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
103 0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
104 0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
105 0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
/* Fixed-length rows of wchar_t (16, 32 and 64 entries) used as the element
 * type of the conversion lookup tables in this file.
 */
108 typedef wchar_t wvec16[16];
109 typedef wchar_t wvec32[32];
110 typedef wchar_t wvec64[64];
112 /* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */
/* Spacing (stand-alone) Unicode form of each T.61 accent, 16 slots looked
 * up as accents[j]; 0 means the slot has no spacing form (slots 0, 9, 0xc).
 * NOTE(review): j is presumably the low nibble of the accent byte -- confirm.
 */
113 static const wvec16 accents = {
114 0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
115 0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
117 /* In the following tables, base characters commented in (parentheses)
118 * are not defined by T.61 but are mapped anyway since their Unicode
122 /* Grave accented chars AEIOU (NWY) */
/* Grave row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
123 static const wvec32 c1_vec1 = {
125 0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
126 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
/* Grave row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
127 static const wvec32 c1_vec2 = {
129 0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
130 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
/* Grave rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
132 static const wvec32 *c1_grave[] = {
133 NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
136 /* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
/* Acute row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
137 static const wvec32 c2_vec1 = {
139 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
140 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
141 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
142 0, 0xdd, 0x179, 0, 0, 0, 0, 0};
/* Acute row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
143 static const wvec32 c2_vec2 = {
145 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
146 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
147 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
148 0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
/* Acute row for base bytes 0xe0-0xff: only T.61 0xe1 (AE) and 0xf1 (ae)
 * have a precomposed form (U+01FC, U+01FD). */
149 static const wvec32 c2_vec3 = {
151 0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
152 0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Acute rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
154 static const wvec32 *c2_acute[] = {
155 NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
158 /* Circumflex AEIOUYCGHJSW (Z) */
/* Circumflex row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
159 static const wvec32 c3_vec1 = {
161 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
162 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
163 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
164 0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
/* Circumflex row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
165 static const wvec32 c3_vec2 = {
167 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
168 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
169 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
170 0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Circumflex rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
171 static const wvec32 *c3_circumflex[] = {
172 NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
175 /* Tilde AIOUN (EVY) */
/* Tilde row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
176 static const wvec32 c4_vec1 = {
178 0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
179 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
/* Tilde row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
180 static const wvec32 c4_vec2 = {
182 0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
183 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Tilde rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
184 static const wvec32 *c4_tilde[] = {
185 NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
188 /* Macron AEIOU (YG) */
/* Macron row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
189 static const wvec32 c5_vec1 = {
191 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
192 0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
/* Macron row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
193 static const wvec32 c5_vec2 = {
195 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
196 0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
/* Macron row for base bytes 0xe0-0xff: only T.61 0xe1 (AE) and 0xf1 (ae)
 * have a precomposed form (U+01E2, U+01E3). */
197 static const wvec32 c5_vec3 = {
199 0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Macron rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
201 static const wvec32 *c5_macron[] = {
202 NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
205 /* Breve AUG (EIO) */
/* Breve row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
206 static const wvec32 c6_vec1 = {
208 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
209 0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Breve row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
210 static const wvec32 c6_vec2 = {
212 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
213 0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Breve rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
214 static const wvec32 *c6_breve[] = {
215 NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
218 /* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
/* Dot-above row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
219 static const wvec32 c7_vec1 = {
221 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
222 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
223 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
224 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
/* Dot-above row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
225 static const wvec32 c7_vec2 = {
227 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
228 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
229 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
230 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Dot-above rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
231 static const wvec32 *c7_dotabove[] = {
232 NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
235 /* Diaeresis AEIOUY (HWXt) */
/* Diaeresis row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
236 static const wvec32 c8_vec1 = {
238 0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
239 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
/* Diaeresis row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
240 static const wvec32 c8_vec2 = {
242 0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
243 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Diaeresis rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
244 static const wvec32 *c8_diaeresis[] = {
245 NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
248 /* Ring Above AU (wy) */
/* Ring-above row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
249 static const wvec32 ca_vec1 = {
251 0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
252 0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Ring-above row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
253 static const wvec32 ca_vec2 = {
255 0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
256 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
/* Ring-above rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
257 static const wvec32 *ca_ringabove[] = {
258 NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
261 /* Cedilla CGKLNRST (EDH) */
/* Cedilla row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
262 static const wvec32 cb_vec1 = {
264 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
265 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
266 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Cedilla row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
267 static const wvec32 cb_vec2 = {
269 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
270 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
271 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Cedilla rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
272 static const wvec32 *cb_cedilla[] = {
273 NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
276 /* Double Acute Accent OU */
/* Double-acute row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
277 static const wvec32 cd_vec1 = {
279 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
280 0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Double-acute row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
281 static const wvec32 cd_vec2 = {
283 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
284 0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Double-acute rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
285 static const wvec32 *cd_doubleacute[] = {
286 NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
289 /* Ogonek AEIU (O) */
/* Ogonek row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
290 static const wvec32 ce_vec1 = {
292 0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
293 0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Ogonek row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
294 static const wvec32 ce_vec2 = {
296 0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
297 0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Ogonek rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
298 static const wvec32 *ce_ogonek[] = {
299 NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
302 /* Caron CDELNRSTZ (AIOUGKjH) */
/* Caron row for base bytes 0x40-0x5f (upper case): index = base & 0x1f,
 * 0 = no precomposed character. */
303 static const wvec32 cf_vec1 = {
305 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
306 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
307 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
308 0, 0, 0x17d, 0, 0, 0, 0, 0};
/* Caron row for base bytes 0x60-0x7f (lower case): index = base & 0x1f,
 * 0 = no precomposed character. */
309 static const wvec32 cf_vec2 = {
311 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
312 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
313 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
314 0, 0, 0x17e, 0, 0, 0, 0, 0};
/* Caron rows selected by base >> 5 (eight ranges of 32 codes);
 * NULL = no precomposed characters in that range. */
315 static const wvec32 *cf_caron[] = {
316 NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
/* Composite tables per accent, 16 slots: slot n holds the row set for T.61
 * accent byte 0xc0+n. Slots 0, 9 and 0xc are NULL (no composites). A pair is
 * resolved as (*cx_tab[j][base >> 5])[base & 0x1f], where a result of 0
 * means no precomposed character exists.
 */
319 static const wvec32 **cx_tab[] = {
320 NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
321 c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
322 cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
/* Scan a T.61 string held in a berval.
 * str: bv_val is read as unsigned bytes; bv_len bounds the scan.
 * NOTE(review): expected to report whether every byte is a valid T.61 code
 * (non-zero t61_tab entry) -- confirm the per-byte test and the return
 * convention.
 */
324 int ldap_t61s_valid( struct berval *str )
326 unsigned char *c = (unsigned char *)str->bv_val;
329 for (i=0; i < str->bv_len; c++,i++)
335 /* Transform a T.61 string to UTF-8.
/* Convert src (T.61) to UTF-8, storing a buffer obtained from LDAP_MALLOC
 * in dst->bv_val.
 *
 * Two passes over the src->bv_len input bytes: the first sums the encoded
 * length of every output character (ldap_x_wc_to_utf8 with a NULL buffer),
 * the second allocates wlen+1 bytes and writes the characters.
 *
 * A byte in 0xc0-0xcf is a leading accent:
 *  - followed by end-of-string or a space, and with a non-zero accents[]
 *    entry, it becomes that spacing accent;
 *  - followed by a base that has a cx_tab entry, the pair becomes the single
 *    precomposed character;
 *  - otherwise the base is emitted first and the combining mark from
 *    t61_tab after it.
 * Every other byte is translated through t61_tab.
 *
 * Returns LDAP_INVALID_SYNTAX for invalid T.61 characters; LDAP_NO_MEMORY
 * presumably when LDAP_MALLOC fails.
 *
 * NOTE(review): the accent branch reads c[1] and treats a 0 byte as end of
 * string, which relies on bv_val being NUL-terminated at bv_len -- confirm
 * that callers guarantee this.
 */
337 int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
343 /* Just count the length of the UTF-8 result first */
344 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
345 /* Invalid T.61 characters? */
347 return LDAP_INVALID_SYNTAX;
348 if ((*c & 0xf0) == 0xc0) {
350 /* If this is the end of the string, or if the base
351 * character is just a space, treat this as a regular
354 if ((!c[1] || c[1] == 0x20) && accents[j]) {
355 wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
356 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
357 /* We have a composite mapping for this pair */
358 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
359 wlen += ldap_x_wc_to_utf8( NULL,
360 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
362 /* No mapping, just swap it around so the base
363 * character comes first.
365 wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
366 wlen += ldap_x_wc_to_utf8(NULL,
372 wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
376 /* Now transform the string */
378 dst->bv_val = LDAP_MALLOC( wlen+1 );
381 return LDAP_NO_MEMORY;
383 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
384 if ((*c & 0xf0) == 0xc0) {
386 /* If this is the end of the string, or if the base
387 * character is just a space, treat this as a regular
390 if ((!c[1] || c[1] == 0x20) && accents[j]) {
391 d += ldap_x_wc_to_utf8(d, accents[j], 6);
392 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
393 /* We have a composite mapping for this pair */
394 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
395 d += ldap_x_wc_to_utf8(d,
396 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
398 /* No mapping, just swap it around so the base
399 * character comes first.
401 d += ldap_x_wc_to_utf8(d, c[1], 6);
402 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
407 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
414 /* For the reverse mapping, we just pay attention to the Latin-oriented
415 * code blocks. These are
416 * 0000 - 007f Basic Latin
417 * 0080 - 00ff Latin-1 Supplement
418 * 0100 - 017f Latin Extended-A
419 * 0180 - 024f Latin Extended-B
420 * 1e00 - 1eff Latin Extended Additional
422 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
423 * unrecognized characters are replaced with '?' 0x3f.
/* Unicode -> T.61 page for U+0000-U+003F, index = code & 0x3f.
 * '#' (0x23) and '$' (0x24) are sent to T.61 0xa6 and 0xa4. */
426 static const wvec64 u000 = {
427 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
428 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
429 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
430 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
431 0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
432 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
433 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
434 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
436 /* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
437 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
438 * on their own, even though it provides them as combiners for other
439 * letters. T.61 doesn't define these pairings either, so this may just
440 * have to be replaced with '?' 0x3f if other software can't cope with it.
/* Unicode -> T.61 page for U+0040-U+007F, index = code & 0x3f.
 * 0x003f ('?') marks a character T.61 lacks; a value with a non-zero high
 * byte is a T.61 pair (accent byte in bits 8-15, base byte in bits 0-7). */
442 static const wvec64 u001 = {
443 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
444 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
445 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
446 0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
447 0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
448 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
449 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
450 0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
/* Unicode -> T.61 page for U+0080-U+00BF, index = code & 0x3f.
 * 0x003f ('?') marks a character T.61 lacks; a value with a non-zero high
 * byte is a T.61 pair (accent byte in bits 8-15, base byte in bits 0-7). */
452 static const wvec64 u002 = {
453 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
454 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
455 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
456 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
457 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
458 0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
459 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
460 0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
/* Unicode -> T.61 page for U+00C0-U+00FF, index = code & 0x3f.
 * Most entries are T.61 pairs (accent byte in bits 8-15, base letter in
 * bits 0-7); single-byte values are direct T.61 codes. */
462 static const wvec64 u003 = {
463 0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
464 0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
465 0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
466 0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
467 0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
468 0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
469 0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
470 0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
472 /* These codes are used here but not defined by T.61:
473 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
/* Unicode -> T.61 page for U+0100-U+013F, index = code & 0x3f.
 * 0x003f ('?') marks a character T.61 lacks; a value with a non-zero high
 * byte is a T.61 pair (accent byte in bits 8-15, base byte in bits 0-7). */
475 static const wvec64 u010 = {
476 0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
477 0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
478 0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
479 0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
480 0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
481 0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
482 0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
483 0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
485 /* These codes are used here but not defined by T.61:
486 * x14e = xc6/x4f, x14f = xc6/x6f
/* Unicode -> T.61 page for U+0140-U+017F, index = code & 0x3f.
 * 0x003f ('?') marks a character T.61 lacks; a value with a non-zero high
 * byte is a T.61 pair (accent byte in bits 8-15, base byte in bits 0-7). */
488 static const wvec64 u011 = {
489 0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
490 0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
491 0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
492 0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
493 0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
494 0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
495 0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
496 0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
498 /* All of the codes in this block are undefined in T.61.
/* Unicode -> T.61 page for U+01C0-U+01FF, index = code & 0x3f.
 * 0x003f ('?') marks a character with no T.61 rendering; other entries are
 * pairs (accent byte in bits 8-15, base byte in bits 0-7). */
500 static const wvec64 u013 = {
501 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
502 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
503 0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
504 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
505 0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
506 0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
507 0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
508 0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
510 /* All of the codes in this block are undefined in T.61.
/* Unicode -> T.61 page for U+0200-U+023F, index = code & 0x3f.
 * 0x003f ('?') marks a character with no T.61 rendering; other entries are
 * pairs (accent byte in bits 8-15, base byte in bits 0-7). */
512 static const wvec64 u020 = {
513 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
514 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
515 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
516 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
517 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
518 0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
519 0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
520 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
/* Unicode -> T.61 page for U+02C0-U+02FF, index = code & 0x3f.
 * The spacing accents that the accents[] table produces (caron, breve, dot
 * above, ring above, ogonek, double acute) map back to the T.61 accent byte
 * followed by a space (0xNN20); everything else is 0x003f ('?'). */
522 static const wvec64 u023 = {
523 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
524 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
525 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
526 0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
527 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
528 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
529 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
530 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
532 /* These are the non-spacing characters by themselves. They should
533 * never appear by themselves in actual text.
/* Unicode -> T.61 page for U+0300-U+033F, index = code & 0x3f.
 * Each combining mark that t61_tab can produce maps back to its single T.61
 * accent byte (0xc1-0xcf); all other marks become 0x003f ('?'). */
535 static const wvec64 u030 = {
536 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
537 0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
538 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
539 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
540 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
541 0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
542 0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
543 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
545 /* None of the following blocks are defined in T.61.
/* Unicode -> T.61 page for U+1E00-U+1E3F, index = code & 0x3f.
 * 0x003f ('?') marks a character with no T.61 rendering; other entries are
 * pairs (accent byte in bits 8-15, base byte in bits 0-7). */
547 static const wvec64 u1e0 = {
548 0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
549 0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
550 0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
551 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
552 0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
553 0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
554 0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
555 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
/* Unicode -> T.61 page for U+1E40-U+1E7F, index = code & 0x3f.
 * 0x003f ('?') marks a character with no T.61 rendering; other entries are
 * pairs (accent byte in bits 8-15, base byte in bits 0-7). */
558 static const wvec64 u1e1 = {
559 0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
560 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
561 0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
562 0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
563 0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
564 0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
565 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
566 0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
/* Unicode -> T.61 page for U+1E80-U+1EBF, index = code & 0x3f.
 * 0x003f ('?') marks a character with no T.61 rendering; other entries are
 * pairs (accent byte in bits 8-15, base byte in bits 0-7). */
569 static const wvec64 u1e2 = {
570 0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
571 0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
572 0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
573 0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
574 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
575 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
576 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
577 0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
/* Unicode -> T.61 page for U+1EC0-U+1EFF, index = code & 0x3f.
 * Only Y/y with grave and Y/y with tilde have a T.61 pair here; every other
 * entry is 0x003f ('?'). */
580 static const wvec64 u1e3 = {
581 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
582 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
583 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
584 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
585 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
586 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
587 0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
588 0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
/* Reverse-map pages for U+0000-U+03FF, indexed by code >> 6 (64 code points
 * per page); NULL = no page for that range. */
591 static const wvec64 *wc00[] = {
592 &u000, &u001, &u002, &u003,
593 &u010, &u011, NULL, &u013,
594 &u020, NULL, NULL, &u023,
595 &u030, NULL, NULL, NULL};
/* Reverse-map pages for U+1E00-U+1EFF, indexed by (code >> 6) & 3. */
597 static const wvec64 *wc1e[] = {
598 &u1e0, &u1e1, &u1e2, &u1e3};
601 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
607 /* Just count the length of the T.61 result first */
608 for (i=0,c=src->bv_val; i < src->bv_len;) {
609 j = ldap_x_utf8_to_wc( &tmp, c );
611 return LDAP_INVALID_SYNTAX;
617 if (wc00[tmp >> 6] &&
618 ((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
624 if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
636 dst->bv_val = LDAP_MALLOC( tlen+1 );
638 return LDAP_NO_MEMORY;
641 for (i=0,c=src->bv_val; i < src->bv_len;) {
642 j = ldap_x_utf8_to_wc( &tmp, c );
647 if (wc00[tmp >> 6]) {
648 tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
657 /* swap order of non-spacing characters */
658 if (wc00[tmp >> 6]) {
659 wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
672 tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];