2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4 * Copyright 1998-2003 The OpenLDAP Foundation.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
11 * A copy of this license is available in the file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
17 * Basic T.61 <-> UTF-8 conversion
19 * These routines will perform a lossless translation from T.61 to UTF-8
20 * and a lossy translation from UTF-8 to T.61.
27 #include <ac/stdlib.h>
29 #include <ac/socket.h>
30 #include <ac/string.h>
34 #include "ldap_utf8.h"
36 #include "ldap_defaults.h"
39 * T.61 is somewhat braindead; even in the 7-bit space it is not
40 * completely equivalent to 7-bit US-ASCII. Our definition of the
41 * character set comes from RFC 1345 with a slightly more readable
42 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
44 * Even though '#' and '$' are present in the 7-bit US-ASCII space,
45 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
56 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
57 * accents of some form or another. There are predefined combinations
58 * for certain characters, but they can also be used arbitrarily. The
59 * table at dkuug.dk maps these accents to the E000 "private use" range
60 * of the Unicode space, but I believe they more properly belong in the
61 * 0300 range (non-spacing accents). The transformation is complicated
62 * slightly because Unicode wants the non-spacing character to follow
63 * the base character, while T.61 has the non-spacing character leading.
64 * Also, T.61 specifically recognizes certain combined pairs as "characters"
65 * but doesn't specify how to treat unrecognized pairs. This code will
66 * always attempt to combine pairs when a known Unicode composite exists.
/* T.61 byte to Unicode code point, indexed directly by the byte value
 * (32 rows of 8).  Bytes x23 '#' and x24 '$' hold 0 here; those two
 * characters live at xa6 and xa4 instead.  Bytes xc1-xcf (except xc9)
 * carry combining marks (U+0300 and up) rather than spacing characters.
 * NOTE(review): a 0 entry presumably means "not defined in T.61"; the
 * test that rejects such bytes is not part of the lines shown -- confirm.
 */
69 static const wchar_t t61_tab[] = {
70 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
71 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
72 0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
73 0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
74 0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
75 0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
76 0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
77 0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
78 0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
79 0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
80 0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
81 0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
82 0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
83 0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
84 0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
85 0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
86 0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
87 0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
88 0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
89 0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
90 0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
91 0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
92 0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
93 0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
94 0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
95 0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
96 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
97 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
98 0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
99 0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
100 0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
101 0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
/* Fixed-length row types for the lookup tables in this file: 16 entries
 * for the stand-alone accent table, 32 for one case-row of composites,
 * 64 for one block of the Unicode to T.61 reverse map.
 */
104 typedef wchar_t wvec16[16];
105 typedef wchar_t wvec32[32];
106 typedef wchar_t wvec64[64];
108 /* Substitutions when an accent byte 0xc1-0xcf appears by itself or is followed by a space (0x20) */
/* One entry per accent byte xc0-xcf, in byte order; 0 means the accent
 * has no spacing equivalent, in which case the decoder's "&& accents[j]"
 * test fails and the byte is handled by the composite or swap branch.
 * NOTE(review): presumably indexed by the low nibble of the accent byte;
 * the line that assigns j is not shown here -- confirm.
 */
109 static const wvec16 accents = {
110 0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
111 0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
113 /* In the following tables, base characters commented in (parentheses)
114 * are not defined by T.61 but are mapped anyway since their Unicode
118 /* Grave accented chars AEIOU (NWY) */
/* Grave + upper-case base (bytes x40-x5f).  Index is (base & 0x1f), so
 * 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
119 static const wvec32 c1_vec1 = {
121 0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
122 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
/* Grave + lower-case base (bytes x60-x7f), same (base & 0x1f) index. */
123 static const wvec32 c1_vec2 = {
125 0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
126 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
128 static const wvec32 *c1_grave[] = {
129 NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
132 /* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
/* Acute + upper-case base (bytes x40-x5f).  Index is (base & 0x1f), so
 * 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
133 static const wvec32 c2_vec1 = {
135 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
136 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
137 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
138 0, 0xdd, 0x179, 0, 0, 0, 0, 0};
/* Acute + lower-case base (bytes x60-x7f), same (base & 0x1f) index. */
139 static const wvec32 c2_vec2 = {
141 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
142 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
143 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
144 0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
/* Acute + base bytes xe0-xff: only xe1 (AE, entry 1) and xf1 (ae,
 * entry 17) have composites.
 */
145 static const wvec32 c2_vec3 = {
147 0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
148 0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f,
 * slot 7 = xe0-xff.  NULL slots have no composites for this accent.
 */
150 static const wvec32 *c2_acute[] = {
151 NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
154 /* Circumflex AEIOUYCGHJSW (Z) */
/* Circumflex + upper-case base (bytes x40-x5f).  Index is (base & 0x1f),
 * so 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
155 static const wvec32 c3_vec1 = {
157 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
158 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
159 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
160 0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
/* Circumflex + lower-case base (bytes x60-x7f), same (base & 0x1f) index. */
161 static const wvec32 c3_vec2 = {
163 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
164 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
165 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
166 0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
167 static const wvec32 *c3_circumflex[] = {
168 NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
171 /* Tilde AIOUN (EVY) */
/* Tilde + upper-case base (bytes x40-x5f).  Index is (base & 0x1f), so
 * 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
172 static const wvec32 c4_vec1 = {
174 0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
175 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
/* Tilde + lower-case base (bytes x60-x7f), same (base & 0x1f) index. */
176 static const wvec32 c4_vec2 = {
178 0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
179 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
180 static const wvec32 *c4_tilde[] = {
181 NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
184 /* Macron AEIOU (YG) */
/* Macron + upper-case base (bytes x40-x5f).  Index is (base & 0x1f), so
 * 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
185 static const wvec32 c5_vec1 = {
187 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
188 0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
/* Macron + lower-case base (bytes x60-x7f), same (base & 0x1f) index. */
189 static const wvec32 c5_vec2 = {
191 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
192 0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
/* Macron + base bytes xe0-xff: only xe1 (AE, entry 1) and xf1 (ae,
 * entry 17) have composites.
 */
193 static const wvec32 c5_vec3 = {
195 0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
196 0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f,
 * slot 7 = xe0-xff.  NULL slots have no composites for this accent.
 */
197 static const wvec32 *c5_macron[] = {
198 NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
201 /* Breve AUG (EIO) */
/* Breve + upper-case base (bytes x40-x5f).  Index is (base & 0x1f), so
 * 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
202 static const wvec32 c6_vec1 = {
204 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
205 0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Breve + lower-case base (bytes x60-x7f), same (base & 0x1f) index. */
206 static const wvec32 c6_vec2 = {
208 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
209 0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
210 static const wvec32 *c6_breve[] = {
211 NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
214 /* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
/* Dot above + upper-case base (bytes x40-x5f).  Index is (base & 0x1f),
 * so 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
215 static const wvec32 c7_vec1 = {
217 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
218 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
219 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
220 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
/* Dot above + lower-case base (bytes x60-x7f), same (base & 0x1f) index.
 * Entry 9 ('i') is 0: unlike 'I' in the upper-case row, it has no composite.
 */
221 static const wvec32 c7_vec2 = {
223 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
224 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
225 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
226 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
227 static const wvec32 *c7_dotabove[] = {
228 NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
231 /* Diaeresis AEIOUY (HWXt) */
/* Diaeresis + upper-case base (bytes x40-x5f).  Index is (base & 0x1f),
 * so 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
232 static const wvec32 c8_vec1 = {
234 0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
235 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
/* Diaeresis + lower-case base (bytes x60-x7f), same (base & 0x1f) index.
 * Entry 20 ('t') exists only in this row.
 */
236 static const wvec32 c8_vec2 = {
238 0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
239 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
240 static const wvec32 *c8_diaeresis[] = {
241 NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
244 /* Ring Above AU (wy) */
/* Ring above + upper-case base (bytes x40-x5f).  Index is (base & 0x1f),
 * so 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
245 static const wvec32 ca_vec1 = {
247 0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
248 0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Ring above + lower-case base (bytes x60-x7f), same (base & 0x1f) index.
 * Entries 23 ('w') and 25 ('y') exist only in this row.
 */
249 static const wvec32 ca_vec2 = {
251 0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
252 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
253 static const wvec32 *ca_ringabove[] = {
254 NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
257 /* Cedilla CGKLNRST (EDH) */
/* Cedilla + upper-case base (bytes x40-x5f).  Index is (base & 0x1f), so
 * 'C' is entry 3; a non-zero entry is the precomposed code point.
 */
258 static const wvec32 cb_vec1 = {
260 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
261 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
262 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Cedilla + lower-case base (bytes x60-x7f), same (base & 0x1f) index. */
263 static const wvec32 cb_vec2 = {
265 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
266 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
267 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
268 static const wvec32 *cb_cedilla[] = {
269 NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
272 /* Double Acute Accent OU */
/* Double acute + upper-case base (bytes x40-x5f).  Index is
 * (base & 0x1f): only 'O' (15) and 'U' (21) have composites.
 */
273 static const wvec32 cd_vec1 = {
275 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
276 0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Double acute + lower-case base (bytes x60-x7f), same (base & 0x1f) index. */
277 static const wvec32 cd_vec2 = {
279 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
280 0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
281 static const wvec32 *cd_doubleacute[] = {
282 NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
285 /* Ogonek AEIU (O) */
/* Ogonek + upper-case base (bytes x40-x5f).  Index is (base & 0x1f), so
 * 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
286 static const wvec32 ce_vec1 = {
288 0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
289 0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Ogonek + lower-case base (bytes x60-x7f), same (base & 0x1f) index. */
290 static const wvec32 ce_vec2 = {
292 0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
293 0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
294 static const wvec32 *ce_ogonek[] = {
295 NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
298 /* Caron CDELNRSTZ (AIOUGKjH) */
/* Caron + upper-case base (bytes x40-x5f).  Index is (base & 0x1f), so
 * 'A' is entry 1; a non-zero entry is the precomposed code point.
 */
299 static const wvec32 cf_vec1 = {
301 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
302 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
303 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
304 0, 0, 0x17d, 0, 0, 0, 0, 0};
/* Caron + lower-case base (bytes x60-x7f), same (base & 0x1f) index.
 * Entry 10 ('j') exists only in this row.
 */
305 static const wvec32 cf_vec2 = {
307 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
308 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
309 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
310 0, 0, 0x17e, 0, 0, 0, 0, 0};
/* Row selector indexed by (base >> 5): slot 2 = x40-x5f, slot 3 = x60-x7f.
 * NULL slots have no composites for this accent.
 */
311 static const wvec32 *cf_caron[] = {
312 NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
/* Accent dispatch: one row selector per accent byte xc0-xcf, in byte
 * order.  The NULL slots (xc0, xc9, xcc) have no composite tables, so
 * the decoder's "cx_tab[j] &&" guard fails for them.
 * NOTE(review): presumably indexed by the accent byte's low nibble; the
 * line that assigns j is not shown here -- confirm.
 */
315 static const wvec32 **cx_tab[] = {
316 NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
317 c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
318 cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
/* Walks the bv_len bytes of str->bv_val one at a time.
 * NOTE(review): the per-byte test and the return statements are not among
 * the lines shown; presumably each byte is checked against t61_tab --
 * confirm before relying on this description.
 */
320 int ldap_t61s_valid( struct berval *str )
322 unsigned char *c = (unsigned char *)str->bv_val;
325 for (i=0; i < str->bv_len; c++,i++)
331 /* Transform a T.61 string to UTF-8.
333 int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
339 /* Just count the length of the UTF-8 result first */
/* The sizing pass calls ldap_x_wc_to_utf8 with a NULL destination and 0
 * as the last argument, and accumulates its return values in wlen.
 */
340 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
341 /* Invalid T.61 characters? */
343 return LDAP_INVALID_SYNTAX;
344 if ((*c & 0xf0) == 0xc0) {
/* Bytes xc0-xcf are T.61 non-spacing accents that precede their base
 * character, so the next byte c[1] is examined together with *c.
 * NOTE(review): c[1] is read even when *c is the last of the bv_len
 * bytes, i.e. bv_val[bv_len]; this relies on the buffer carrying a
 * terminating NUL past bv_len -- confirm that every caller guarantees it.
 */
346 /* If this is the end of the string, or if the base
347 * character is just a space, treat this as a regular
350 if ((!c[1] || c[1] == 0x20) && accents[j]) {
351 wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
352 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
353 /* We have a composite mapping for this pair */
354 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
355 wlen += ldap_x_wc_to_utf8( NULL,
356 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
358 /* No mapping, just swap it around so the base
359 * character comes first.
361 wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
362 wlen += ldap_x_wc_to_utf8(NULL,
368 wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
372 /* Now transform the string */
/* Second pass: the same three-way case analysis as the sizing pass, but
 * writing through d and passing 6 as the last argument to
 * ldap_x_wc_to_utf8.  The output buffer is sized wlen+1.
 * NOTE(review): presumably the extra byte holds a terminating NUL and
 * LDAP_NO_MEMORY is returned when LDAP_MALLOC fails; the lines that
 * would show this are not visible here -- confirm.
 */
374 dst->bv_val = LDAP_MALLOC( wlen+1 );
377 return LDAP_NO_MEMORY;
379 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
380 if ((*c & 0xf0) == 0xc0) {
382 /* If this is the end of the string, or if the base
383 * character is just a space, treat this as a regular
386 if ((!c[1] || c[1] == 0x20) && accents[j]) {
387 d += ldap_x_wc_to_utf8(d, accents[j], 6);
388 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
389 /* We have a composite mapping for this pair */
390 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
391 d += ldap_x_wc_to_utf8(d,
392 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
394 /* No mapping, just swap it around so the base
395 * character comes first.
397 d += ldap_x_wc_to_utf8(d, c[1], 6);
398 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
403 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
410 /* For the reverse mapping, we just pay attention to the Latin-oriented
411 * code blocks. These are
412 * 0000 - 007f Basic Latin
413 * 0080 - 00ff Latin-1 Supplement
414 * 0100 - 017f Latin Extended-A
415 * 0180 - 024f Latin Extended-B
416 * 1e00 - 1eff Latin Extended Additional
418 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
419 * unrecognized characters are replaced with '?' 0x3f.
/* Reverse map for U+0000 - U+003F, indexed by (code & 0x3f).  Identity
 * except '#' (x23) and '$' (x24), which T.61 places at xa6 and xa4.
 */
422 static const wvec64 u000 = {
423 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
424 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
425 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
426 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
427 0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
428 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
429 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
430 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
432 /* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
433 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
434 * on their own, even though it provides them as combiners for other
435 * letters. T.61 doesn't define these pairings either, so this may just
436 * have to be replaced with '?' 0x3f if other software can't cope with it.
/* Reverse map for U+0040 - U+007F, indexed by (code & 0x3f).  An entry
 * with a non-zero high byte is a two-byte T.61 sequence: accent byte in
 * the high byte, base byte in the low byte.  Backslash and the two curly
 * braces have no T.61 form and become '?' (x3f).
 */
438 static const wvec64 u001 = {
439 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
440 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
441 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
442 0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
443 0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
444 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
445 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
446 0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
/* Reverse map for U+0080 - U+00BF, indexed by (code & 0x3f).  The spacing
 * accents in this block (U+00A8, U+00AF, U+00B4, U+00B8) become a T.61
 * accent byte followed by a space; 003f marks code points with no T.61
 * equivalent.
 */
448 static const wvec64 u002 = {
449 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
450 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
451 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
452 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
453 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
454 0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
455 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
456 0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
/* Reverse map for U+00C0 - U+00FF, indexed by (code & 0x3f).  Accented
 * letters become an (accent byte, base byte) pair packed as high/low
 * byte; the remaining entries are single T.61 bytes.
 */
458 static const wvec64 u003 = {
459 0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
460 0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
461 0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
462 0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
463 0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
464 0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
465 0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
466 0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
468 /* These codes are used here but not defined by T.61:
469 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
/* Reverse map for U+0100 - U+013F, indexed by (code & 0x3f).  Two-byte
 * entries pack the T.61 accent byte (high) and base byte (low); 003f
 * marks a code point with no T.61 equivalent.
 */
471 static const wvec64 u010 = {
472 0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
473 0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
474 0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
475 0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
476 0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
477 0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
478 0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
479 0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
481 /* These codes are used here but not defined by T.61:
482 * x14e = xc6/x4f, x14f = xc6/x6f
/* Reverse map for U+0140 - U+017F, indexed by (code & 0x3f).  Two-byte
 * entries pack the T.61 accent byte (high) and base byte (low); 003f
 * marks a code point with no T.61 equivalent.
 */
484 static const wvec64 u011 = {
485 0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
486 0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
487 0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
488 0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
489 0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
490 0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
491 0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
492 0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
494 /* All of the codes in this block are undefined in T.61.
/* Reverse map for U+01C0 - U+01FF, indexed by (code & 0x3f).  Mapped
 * entries are (accent byte, base byte) pairs; the xe1 and xf1 base
 * bytes are the T.61 codes for AE and ae.  003f means no mapping.
 */
496 static const wvec64 u013 = {
497 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
498 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
499 0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
500 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
501 0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
502 0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
503 0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
504 0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
506 /* All of the codes in this block are undefined in T.61.
/* Reverse map for U+0200 - U+023F, indexed by (code & 0x3f).  Mapped
 * entries are (accent byte, base byte) pairs; 003f means no mapping.
 */
508 static const wvec64 u020 = {
509 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
510 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
511 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
512 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
513 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
514 0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
515 0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
516 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
/* Reverse map for U+02C0 - U+02FF, indexed by (code & 0x3f).  Only the
 * spacing accents U+02C7, U+02D8 - U+02DB and U+02DD are mapped, each to
 * a T.61 accent byte followed by a space (x20); 003f means no mapping.
 */
518 static const wvec64 u023 = {
519 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
520 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
521 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
522 0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
523 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
524 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
525 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
526 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
528 /* These are the non-spacing characters by themselves. They should
529 * never appear by themselves in actual text.
/* Reverse map for U+0300 - U+033F, indexed by (code & 0x3f).  A mapped
 * entry is the bare T.61 accent byte (xc1-xcf) for that combining mark;
 * 003f means no mapping.
 */
531 static const wvec64 u030 = {
532 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
533 0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
534 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
535 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
536 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
537 0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
538 0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
539 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
541 /* None of the following blocks are defined in T.61.
/* Reverse map for U+1E00 - U+1E3F, indexed by (code & 0x3f).  Mapped
 * entries are (accent byte, base byte) pairs; 003f means no mapping.
 */
543 static const wvec64 u1e0 = {
544 0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
545 0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
546 0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
547 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
548 0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
549 0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
550 0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
551 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
/* Reverse map for U+1E40 - U+1E7F, indexed by (code & 0x3f).  Mapped
 * entries are (accent byte, base byte) pairs; 003f means no mapping.
 */
554 static const wvec64 u1e1 = {
555 0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
556 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
557 0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
558 0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
559 0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
560 0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
561 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
562 0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
/* Reverse map for U+1E80 - U+1EBF, indexed by (code & 0x3f).  Mapped
 * entries are (accent byte, base byte) pairs; 003f means no mapping.
 */
565 static const wvec64 u1e2 = {
566 0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
567 0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
568 0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
569 0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
570 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
571 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
572 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
573 0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
/* Reverse map for U+1EC0 - U+1EFF, indexed by (code & 0x3f).  Only the
 * grave and tilde forms of Y/y are mapped; 003f means no mapping.
 */
576 static const wvec64 u1e3 = {
577 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
578 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
579 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
580 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
581 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
582 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
583 0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
584 0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
/* Block selector for the low code points: slot (code >> 6) points at the
 * 64-entry reverse-map block for that range, so the 16 slots span
 * U+0000 - U+03FF.  A NULL slot means no block exists for that range;
 * the encoder tests the slot before dereferencing it.
 */
587 static const wvec64 *wc00[] = {
588 &u000, &u001, &u002, &u003,
589 &u010, &u011, NULL, &u013,
590 &u020, NULL, NULL, &u023,
591 &u030, NULL, NULL, NULL};
/* Block selector for the four 64-entry blocks of U+1E00 - U+1EFF; the
 * encoder indexes it with ((code >> 6) & 3).  All four slots are
 * populated, so it is dereferenced without a NULL test.
 */
593 static const wvec64 *wc1e[] = {
594 &u1e0, &u1e1, &u1e2, &u1e3};
597 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
603 /* Just count the length of the T.61 result first */
604 for (i=0,c=src->bv_val; i < src->bv_len;) {
605 j = ldap_x_utf8_to_wc( &tmp, c );
607 return LDAP_INVALID_SYNTAX;
613 if (wc00[tmp >> 6] &&
614 ((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
620 if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
632 dst->bv_val = LDAP_MALLOC( tlen+1 );
634 return LDAP_NO_MEMORY;
637 for (i=0,c=src->bv_val; i < src->bv_len;) {
638 j = ldap_x_utf8_to_wc( &tmp, c );
643 if (wc00[tmp >> 6]) {
644 tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
653 /* swap order of non-spacing characters */
654 if (wc00[tmp >> 6]) {
655 wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
668 tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];