3 * Copyright 2002 The OpenLDAP Foundation, All Rights Reserved.
4 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
8 * Basic T.61 <-> UTF-8 conversion
10 * These routines will perform a lossless translation from T.61 to UTF-8
11 * and a lossy translation from UTF-8 to T.61.
18 #include <ac/stdlib.h>
20 #include <ac/socket.h>
21 #include <ac/string.h>
25 #include "ldap_utf8.h"
27 #include "ldap_defaults.h"
30 * T.61 is somewhat braindead; even in the 7-bit space it is not
31 * completely equivalent to 7-bit US-ASCII. Our definition of the
32 * character set comes from RFC 1345 with a slightly more readable
33 * rendition at http://std.dkuug.dk/i18n/charmaps/T.61-8BIT.
35 * Even though '#' and '$' are present in the 7-bit US-ASCII space,
36 * (x23 and x24, resp.) in T.61 they are mapped to 8-bit characters
47 * In T.61, the codes xC1 to xCF (excluding xC9, unused) are non-spacing
48 * accents of some form or another. There are predefined combinations
49 * for certain characters, but they can also be used arbitrarily. The
50 * table at dkuug.dk maps these accents to the E000 "private use" range
51 * of the Unicode space, but I believe they more properly belong in the
52 * 0300 range (non-spacing accents). The transformation is complicated
53 * slightly because Unicode wants the non-spacing character to follow
54 * the base character, while T.61 has the non-spacing character leading.
55 * Also, T.61 specifically recognizes certain combined pairs as "characters"
56 * but doesn't specify how to treat unrecognized pairs. This code will
57 * always attempt to combine pairs when a known Unicode composite exists.
/* T.61 byte -> Unicode code point, indexed by the raw byte value.
 * A zero entry marks a byte with no mapping; the validity check and the
 * converter both treat zero as "invalid", which also rejects NUL itself.
 */
const wchar_t ldap_t61_tab[] = {
	/* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
	/* 0x08 */ 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
	/* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
	/* 0x18 */ 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
	/* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0000, 0x0000, 0x0025, 0x0026, 0x0027,
	/* 0x28 */ 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
	/* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
	/* 0x38 */ 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
	/* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
	/* 0x48 */ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
	/* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
	/* 0x58 */ 0x0058, 0x0059, 0x005a, 0x005b, 0x0000, 0x005d, 0x0000, 0x005f,
	/* 0x60 */ 0x0000, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
	/* 0x68 */ 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
	/* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
	/* 0x78 */ 0x0078, 0x0079, 0x007a, 0x0000, 0x007c, 0x0000, 0x0000, 0x007f,
	/* 0x80 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	/* 0x88 */ 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
	/* 0x90 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	/* 0x98 */ 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
	/* 0xa0 */ 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x0024, 0x00a5, 0x0023, 0x00a7,
	/* 0xa8 */ 0x00a4, 0x0000, 0x0000, 0x00ab, 0x0000, 0x0000, 0x0000, 0x0000,
	/* 0xb0 */ 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00d7, 0x00b5, 0x00b6, 0x00b7,
	/* 0xb8 */ 0x00f7, 0x0000, 0x0000, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
	/* 0xc0 */ 0x0000, 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0306, 0x0307,
	/* 0xc8 */ 0x0308, 0x0000, 0x030a, 0x0327, 0x0332, 0x030b, 0x0328, 0x030c,
	/* 0xd0 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	/* 0xd8 */ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
	/* 0xe0 */ 0x2126, 0x00c6, 0x00d0, 0x00aa, 0x0126, 0x0000, 0x0132, 0x013f,
	/* 0xe8 */ 0x0141, 0x00d8, 0x0152, 0x00ba, 0x00de, 0x0166, 0x014a, 0x0149,
	/* 0xf0 */ 0x0138, 0x00e6, 0x0111, 0x00f0, 0x0127, 0x0131, 0x0133, 0x0140,
	/* 0xf8 */ 0x0142, 0x00f8, 0x0153, 0x00df, 0x00fe, 0x0167, 0x014b, 0x0000
};
/* Fixed-length rows of wide characters used to type the lookup tables. */
typedef wchar_t wvec16[16];	/* row type of the accents table */
typedef wchar_t wvec32[32];	/* row type of the composite tables, indexed by (code & 0x1f) */
typedef wchar_t wvec64[64];
99 /* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */
100 static const wvec16 accents = {
101 0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
102 0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
104 /* In the following tables, base characters commented in (parentheses)
105 * are not defined by T.61 but are mapped anyway since their Unicode
109 /* Grave accented chars AEIOU (NWY) */
110 static const wvec32 c1_vec1 = {
112 0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
113 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
114 static const wvec32 c1_vec2 = {
116 0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
117 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
119 static const wvec32 *c1_grave[] = {
120 NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
123 /* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
124 static const wvec32 c2_vec1 = {
126 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
127 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
128 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
129 0, 0xdd, 0x179, 0, 0, 0, 0, 0};
130 static const wvec32 c2_vec2 = {
132 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
133 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
134 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
135 0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
136 static const wvec32 c2_vec3 = {
138 0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
141 static const wvec32 *c2_acute[] = {
142 NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
145 /* Circumflex AEIOUYCGHJSW (Z) */
146 static const wvec32 c3_vec1 = {
148 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
149 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
150 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
151 0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
152 static const wvec32 c3_vec2 = {
154 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
155 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
156 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
157 0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
158 static const wvec32 *c3_circumflex[] = {
159 NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
162 /* Tilde AIOUN (EVY) */
163 static const wvec32 c4_vec1 = {
165 0, 0xc5, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
166 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
167 static const wvec32 c4_vec2 = {
169 0, 0xe5, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
170 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
171 static const wvec32 *c4_tilde[] = {
172 NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
175 /* Macron AEIOU (YG) */
176 static const wvec32 c5_vec1 = {
178 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
179 0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
180 static const wvec32 c5_vec2 = {
182 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
183 0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
184 static const wvec32 c5_vec3 = {
186 0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187 0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
188 static const wvec32 *c5_macron[] = {
189 NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
192 /* Breve AUG (EIO) */
193 static const wvec32 c6_vec1 = {
195 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
196 0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
197 static const wvec32 c6_vec2 = {
199 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
200 0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
201 static const wvec32 *c6_breve[] = {
202 NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
205 /* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
206 static const wvec32 c7_vec1 = {
208 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
209 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
210 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
211 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
212 static const wvec32 c7_vec2 = {
214 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
215 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
216 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
217 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
218 static const wvec32 *c7_dotabove[] = {
219 NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
222 /* Diaeresis AEIOUY (HWXt) */
223 static const wvec32 c8_vec1 = {
225 0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
226 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
227 static const wvec32 c8_vec2 = {
229 0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
230 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
231 static const wvec32 *c8_diaeresis[] = {
232 NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
235 /* Ring Above AU (wy) */
236 static const wvec32 ca_vec1 = {
238 0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
239 0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
240 static const wvec32 ca_vec2 = {
242 0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
243 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
244 static const wvec32 *ca_ringabove[] = {
245 NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
248 /* Cedilla CGKLNRST (EDH) */
249 static const wvec32 cb_vec1 = {
251 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
252 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
253 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
254 static const wvec32 cb_vec2 = {
256 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
257 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
258 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
259 static const wvec32 *cb_cedilla[] = {
260 NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
263 /* Double Acute Accent OU */
264 static const wvec32 cd_vec1 = {
266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
267 0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
268 static const wvec32 cd_vec2 = {
270 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
271 0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
272 static const wvec32 *cd_doubleacute[] = {
273 NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
276 /* Ogonek AEIU (O) */
277 static const wvec32 ce_vec1 = {
279 0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
280 0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
281 static const wvec32 ce_vec2 = {
283 0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
284 0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
285 static const wvec32 *ce_ogonek[] = {
286 NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
289 /* Caron CDELNRSTZ (AIOUGKjH) */
290 static const wvec32 cf_vec1 = {
292 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
293 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
294 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
295 0, 0, 0x17d, 0, 0, 0, 0, 0};
296 static const wvec32 cf_vec2 = {
298 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
299 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
300 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
301 0, 0, 0x17e, 0, 0, 0, 0, 0};
302 static const wvec32 *cf_caron[] = {
303 NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
306 static const wvec32 **cx_tab[] = {
307 NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
308 c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
309 cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
311 int ldap_t61s_valid( struct berval *str )
313 unsigned char *c = (unsigned char *)str->bv_val;
316 for (i=0; i < str->bv_len; c++,i++)
317 if (!ldap_t61_tab[*c])
322 /* Transform a T.61 string to UTF-8.
324 int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
331 /* Just count the length of the UTF-8 result first */
332 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
333 /* Invalid T.61 characters? */
334 if (!ldap_t61_tab[*c])
335 return LDAP_INVALID_SYNTAX;
336 if (*c & 0xf0 == 0xc0) {
338 /* If this is the end of the string, or if the base
339 * character is just a space, treat this as a regular
342 if ((!c[1] || c[1] == 0x20) && accents[j]) {
343 wlen += ldap_x_wc_to_utf8(NULL, accents[j], 0);
344 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
345 /* We have a composite mapping for this pair */
346 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
347 wlen += ldap_x_wc_to_utf8( NULL,
348 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 0);
350 /* No mapping, just swap it around so the base
351 * character comes first.
353 wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
354 wlen += ldap_x_wc_to_utf8(NULL,
355 ldap_t61_tab[*c], 0);
360 wlen += ldap_x_wc_to_utf8(NULL, ldap_t61_tab[*c], 0);
364 /* Now transform the string */
366 dst->bv_val = LDAP_MALLOC( wlen+1 );
369 return LDAP_NO_MEMORY;
371 for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
372 if (*c & 0xf0 == 0xc0) {
374 /* If this is the end of the string, or if the base
375 * character is just a space, treat this as a regular
378 if ((!c[1] || c[1] == 0x20) && accents[j]) {
379 d += ldap_x_wc_to_utf8(d, accents[j], 6);
380 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
381 /* We have a composite mapping for this pair */
382 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
383 d += ldap_x_wc_to_utf8(d,
384 (*cx_tab[j][c[1]>>5])[c[1]&0x1f], 6);
386 /* No mapping, just swap it around so the base
387 * character comes first.
389 d += ldap_x_wc_to_utf8(d, c[1], 6);
390 d += ldap_x_wc_to_utf8(d, ldap_t61_tab[*c], 6);
395 d += ldap_x_wc_to_utf8(d, ldap_t61_tab[*c], 6);
402 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )