3 * Copyright 2002-2003 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
8 * Basic T.61 <-> UTF-8 conversion
10 * These routines will perform a lossless translation from T.61 to UTF-8
11 * and a lossy translation from UTF-8 to T.61.
18 #include <ac/stdlib.h>
20 #include <ac/socket.h>
21 #include <ac/string.h>
25 #include "ldap_utf8.h"
27 #include "ldap_defaults.h"
30 * T.61 is somewhat braindead; even in the 7-bit space it is not
31 * completely equivalent to 7-bit US-ASCII. Our definition of the
32 * character set comes from RFC 1345 with a slightly more readable
33 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
35 * Even though '#' and '$' are present in the 7-bit US-ASCII space,
36 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
47 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
48 * accents of some form or another. There are predefined combinations
49 * for certain characters, but they can also be used arbitrarily. The
50 * table at dkuug.dk maps these accents to the E000 "private use" range
51 * of the Unicode space, but I believe they more properly belong in the
52 * 0300 range (non-spacing accents). The transformation is complicated
53 * slightly because Unicode wants the non-spacing character to follow
54 * the base character, while T.61 has the non-spacing character leading.
55 * Also, T.61 specifically recognizes certain combined pairs as "characters"
56 * but doesn't specify how to treat unrecognized pairs. This code will
57 * always attempt to combine pairs when a known Unicode composite exists.
60 static const wchar_t t61_tab[] = {
61 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
62 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
63 0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
64 0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
65 0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
66 0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
67 0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
68 0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
69 0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
70 0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
71 0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
72 0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
73 0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
74 0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
75 0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
76 0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
77 0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
78 0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
79 0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
80 0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
81 0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
82 0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
83 0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
84 0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
85 0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
86 0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
87 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
88 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
89 0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
90 0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
91 0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
92 0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
95 typedef wchar_t wvec16[16];
96 typedef wchar_t wvec32[32];
97 typedef wchar_t wvec64[64];
99 /* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */
100 static const wvec16 accents = {
101 0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
102 0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
104 /* In the following tables, base characters commented in (parentheses)
105 * are not defined by T.61 but are mapped anyway since their Unicode
109 /* Grave accented chars AEIOU (NWY) */
110 static const wvec32 c1_vec1 = {
112 0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
113 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
114 static const wvec32 c1_vec2 = {
116 0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
117 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
119 static const wvec32 *c1_grave[] = {
120 NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
123 /* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
124 static const wvec32 c2_vec1 = {
126 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
127 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
128 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
129 0, 0xdd, 0x179, 0, 0, 0, 0, 0};
130 static const wvec32 c2_vec2 = {
132 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
133 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
134 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
135 0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
136 static const wvec32 c2_vec3 = {
138 0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
141 static const wvec32 *c2_acute[] = {
142 NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
145 /* Circumflex AEIOUYCGHJSW (Z) */
146 static const wvec32 c3_vec1 = {
148 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
149 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
150 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
151 0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
152 static const wvec32 c3_vec2 = {
154 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
155 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
156 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
157 0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
158 static const wvec32 *c3_circumflex[] = {
159 NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
162 /* Tilde AIOUN (EVY) */
163 static const wvec32 c4_vec1 = {
165 0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
166 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
167 static const wvec32 c4_vec2 = {
169 0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
170 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
171 static const wvec32 *c4_tilde[] = {
172 NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
175 /* Macron AEIOU (YG) */
176 static const wvec32 c5_vec1 = {
178 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
179 0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
180 static const wvec32 c5_vec2 = {
182 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
183 0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
184 static const wvec32 c5_vec3 = {
186 0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187 0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
188 static const wvec32 *c5_macron[] = {
189 NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
192 /* Breve AUG (EIO) */
193 static const wvec32 c6_vec1 = {
195 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
196 0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
197 static const wvec32 c6_vec2 = {
199 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
200 0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
201 static const wvec32 *c6_breve[] = {
202 NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
205 /* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
206 static const wvec32 c7_vec1 = {
208 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
209 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
210 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
211 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
212 static const wvec32 c7_vec2 = {
214 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
215 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
216 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
217 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
218 static const wvec32 *c7_dotabove[] = {
219 NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
222 /* Diaeresis AEIOUY (HWXt) */
223 static const wvec32 c8_vec1 = {
225 0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
226 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
227 static const wvec32 c8_vec2 = {
229 0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
230 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
231 static const wvec32 *c8_diaeresis[] = {
232 NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
235 /* Ring Above AU (wy) */
236 static const wvec32 ca_vec1 = {
238 0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
239 0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
240 static const wvec32 ca_vec2 = {
242 0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
243 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
244 static const wvec32 *ca_ringabove[] = {
245 NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
248 /* Cedilla CGKLNRST (EDH) */
249 static const wvec32 cb_vec1 = {
251 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
252 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
253 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
254 static const wvec32 cb_vec2 = {
256 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
257 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
258 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
259 static const wvec32 *cb_cedilla[] = {
260 NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
263 /* Double Acute Accent OU */
264 static const wvec32 cd_vec1 = {
266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
267 0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
268 static const wvec32 cd_vec2 = {
270 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
271 0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
272 static const wvec32 *cd_doubleacute[] = {
273 NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
276 /* Ogonek AEIU (O) */
277 static const wvec32 ce_vec1 = {
279 0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
280 0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
281 static const wvec32 ce_vec2 = {
283 0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
284 0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
285 static const wvec32 *ce_ogonek[] = {
286 NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
289 /* Caron CDELNRSTZ (AIOUGKjH) */
290 static const wvec32 cf_vec1 = {
292 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
293 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
294 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
295 0, 0, 0x17d, 0, 0, 0, 0, 0};
296 static const wvec32 cf_vec2 = {
298 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
299 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
300 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
301 0, 0, 0x17e, 0, 0, 0, 0, 0};
302 static const wvec32 *cf_caron[] = {
303 NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
306 static const wvec32 **cx_tab[] = { /* 16 slots, indexed by j in ldap_t61s_to_utf8s (presumably the low nibble of a 0xc0-0xcf accent byte - confirm); NULL = no composite tables for that code */
307 NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
308 c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
309 cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron }; /* each entry is an 8-slot table selected by base>>5, then a 32-entry vector selected by base&0x1f */
311 int ldap_t61s_valid( struct berval *str ) /* scans the bytes of str->bv_val; str itself is only read in the visible code */
313 unsigned char *c = (unsigned char *)str->bv_val; /* walk the value as unsigned bytes so values >= 0x80 stay non-negative */
316 for (i=0; i < str->bv_len; c++,i++) /* one iteration per byte, bounded by bv_len rather than by a terminator */
322 /* Transform a T.61 string to UTF-8.
324 int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst ) /* two passes over src: size the UTF-8 result, then allocate dst->bv_val and fill it; may return LDAP_INVALID_SYNTAX or LDAP_NO_MEMORY */
330 /* Just count the length of the UTF-8 result first */
331 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
332 /* Invalid T.61 characters? */
334 return LDAP_INVALID_SYNTAX;
335 if ((*c & 0xf0) == 0xc0) { /* parentheses are required: == binds tighter than &, so the unparenthesized test was always 0 and this branch never ran */
337 /* If this is the end of the string, or if the base
338 * character is just a space, treat this as a regular
341 if ((!c[1] || c[1] == 0x20) && accents[j]) { /* NOTE(review): on the final byte c[1] is bv_val[bv_len]; assumes the value is NUL-terminated - confirm */
342 wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
343 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
344 /* We have a composite mapping for this pair */
345 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
346 wlen += ldap_x_wc_to_utf8( NULL,
347 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
349 /* No mapping, just swap it around so the base
350 * character comes first.
352 wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
353 wlen += ldap_x_wc_to_utf8(NULL,
359 wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
363 /* Now transform the string */
365 dst->bv_val = LDAP_MALLOC( wlen+1 );
368 return LDAP_NO_MEMORY;
370 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
371 if ((*c & 0xf0) == 0xc0) { /* same parenthesized test as the counting pass, so the bytes written match the length computed above */
373 /* If this is the end of the string, or if the base
374 * character is just a space, treat this as a regular
377 if ((!c[1] || c[1] == 0x20) && accents[j]) {
378 d += ldap_x_wc_to_utf8(d, accents[j], 6);
379 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
380 /* We have a composite mapping for this pair */
381 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
382 d += ldap_x_wc_to_utf8(d,
383 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
385 /* No mapping, just swap it around so the base
386 * character comes first.
388 d += ldap_x_wc_to_utf8(d, c[1], 6);
389 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
394 d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
401 /* For the reverse mapping, we just pay attention to the Latin-oriented
402 * code blocks. These are
403 * 0000 - 007f Basic Latin
404 * 0080 - 00ff Latin-1 Supplement
405 * 0100 - 017f Latin Extended-A
406 * 0180 - 024f Latin Extended-B
407 * 1e00 - 1eff Latin Extended Additional
409 * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
410 * unrecognized characters are replaced with '?' 0x3f.
413 static const wvec64 u000 = {
414 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
415 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
416 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
417 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
418 0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
419 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
420 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
421 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
423 /* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
424 * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
425 * on their own, even though it provides them as combiners for other
426 * letters. T.61 doesn't define these pairings either, so this may just
427 * have to be replaced with '?' 0x3f if other software can't cope with it.
429 static const wvec64 u001 = {
430 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
431 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
432 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
433 0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
434 0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
435 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
436 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
437 0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
439 static const wvec64 u002 = {
440 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
441 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
442 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
443 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
444 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
445 0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
446 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
447 0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
449 static const wvec64 u003 = {
450 0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
451 0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
452 0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
453 0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
454 0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
455 0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
456 0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
457 0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
459 /* These codes are used here but not defined by T.61:
460 * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
462 static const wvec64 u010 = {
463 0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
464 0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
465 0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
466 0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
467 0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
468 0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
469 0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
470 0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
472 /* These codes are used here but not defined by T.61:
473 * x14e = xc6/x4f, x14f = xc6/x6f
475 static const wvec64 u011 = {
476 0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
477 0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
478 0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
479 0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
480 0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
481 0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
482 0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
483 0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
485 /* All of the codes in this block are undefined in T.61.
487 static const wvec64 u013 = {
488 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
489 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
490 0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f,
491 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
492 0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
493 0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
494 0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
495 0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
497 /* All of the codes in this block are undefined in T.61.
499 static const wvec64 u020 = {
500 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
501 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
502 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
503 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
504 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
505 0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
506 0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
507 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
509 static const wvec64 u023 = {
510 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
511 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
512 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
513 0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
514 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
515 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
516 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
517 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
519 /* These are the non-spacing characters by themselves. They should
520 * never appear by themselves in actual text.
522 static const wvec64 u030 = {
523 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
524 0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
525 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
526 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
527 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
528 0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
529 0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
530 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
532 /* None of the following blocks are defined in T.61.
534 static const wvec64 u1e0 = {
535 0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f,
536 0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
537 0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
538 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
539 0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
540 0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
541 0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
542 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
545 static const wvec64 u1e1 = {
546 0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
547 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
548 0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
549 0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
550 0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
551 0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
552 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
553 0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
556 static const wvec64 u1e2 = {
557 0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
558 0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
559 0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
560 0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
561 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
562 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
563 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
564 0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
567 static const wvec64 u1e3 = {
568 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
569 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
570 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
571 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
572 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
573 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
574 0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
575 0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
578 static const wvec64 *wc00[] = {
579 &u000, &u001, &u002, &u003,
580 &u010, &u011, NULL, &u013,
581 &u020, NULL, NULL, &u023,
582 &u030, NULL, NULL, NULL};
584 static const wvec64 *wc1e[] = {
585 &u1e0, &u1e1, &u1e2, &u1e3};
588 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
594 /* Just count the length of the T.61 result first */
595 for (i=0,c=src->bv_val; i < src->bv_len;) {
596 j = ldap_x_utf8_to_wc( &tmp, c );
598 return LDAP_INVALID_SYNTAX;
604 if (wc00[tmp >> 6] &&
605 ((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
611 if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
623 dst->bv_val = LDAP_MALLOC( tlen+1 );
625 return LDAP_NO_MEMORY;
628 for (i=0,c=src->bv_val; i < src->bv_len;) {
629 j = ldap_x_utf8_to_wc( &tmp, c );
634 if (wc00[tmp >> 6]) {
635 tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
644 /* swap order of non-spacing characters */
645 if (wc00[tmp >> 6]) {
646 wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
659 tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];