]> git.sur5r.net Git - openldap/commitdiff
Added conversion from UTF-8 to T.61.
authorHoward Chu <hyc@openldap.org>
Fri, 1 Feb 2002 03:13:04 +0000 (03:13 +0000)
committerHoward Chu <hyc@openldap.org>
Fri, 1 Feb 2002 03:13:04 +0000 (03:13 +0000)
libraries/libldap/t61.c

index cce4b795ebc5a7c9c6b9766ed4d8a357775c364c..5cb25702e87b5cdf1811efff8e7e7b0b22d197e1 100644 (file)
@@ -57,7 +57,7 @@
  * always attempt to combine pairs when a known Unicode composite exists.
  */
 
-const wchar_t ldap_t61_tab[] = {
+static const wchar_t t61_tab[] = {
        0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
        0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
        0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
@@ -162,11 +162,11 @@ static const wvec32 *c3_circumflex[] = {
 /* Tilde AIOUN (EVY) */
 static const wvec32 c4_vec1 = {
        /* Upper case */
-       0, 0xc5, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
+       0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
        0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
 static const wvec32 c4_vec2 = {
        /* Lower case */
-       0, 0xe5, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
+       0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
        0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
 static const wvec32 *c4_tilde[] = {
        NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
@@ -314,7 +314,7 @@ int ldap_t61s_valid( struct berval *str )
        int i;
 
        for (i=0; i < str->bv_len; c++,i++)
-               if (!ldap_t61_tab[*c])
+               if (!t61_tab[*c])
                        return 0;
        return 1;
 }
@@ -325,13 +325,12 @@ int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
 {
        unsigned char *c;
        char *d;
-       wchar_t tmp;
        int i, wlen = 0;
 
        /* Just count the length of the UTF-8 result first */
        for (i=0,c=(unsigned char *)src->bv_val; i < src->bv_len; c++,i++) {
                /* Invalid T.61 characters? */
-               if (!ldap_t61_tab[*c]) 
+               if (!t61_tab[*c]) 
                        return LDAP_INVALID_SYNTAX;
                if (*c & 0xf0 == 0xc0) {
                        int j = *c & 0x0f;
@@ -352,12 +351,12 @@ int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
                         */
                                wlen += ldap_x_wc_to_utf8(NULL, c[1], 0);
                                wlen += ldap_x_wc_to_utf8(NULL,
-                                       ldap_t61_tab[*c], 0);
+                                       t61_tab[*c], 0);
                        }
                        c++; i++;
                        continue;
                } else {
-                       wlen += ldap_x_wc_to_utf8(NULL, ldap_t61_tab[*c], 0);
+                       wlen += ldap_x_wc_to_utf8(NULL, t61_tab[*c], 0);
                }
        }
 
@@ -387,18 +386,292 @@ int ldap_t61s_to_utf8s( struct berval *src, struct berval *dst )
                         * character comes first.
                         */
                                d += ldap_x_wc_to_utf8(d, c[1], 6);
-                               d += ldap_x_wc_to_utf8(d, ldap_t61_tab[*c], 6);
+                               d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
                        }
                        c++; i++;
                        continue;
                } else {
-                       d += ldap_x_wc_to_utf8(d, ldap_t61_tab[*c], 6);
+                       d += ldap_x_wc_to_utf8(d, t61_tab[*c], 6);
                }
        }
-       *d = 0;
+       *d = '\0';
        return LDAP_SUCCESS;
 }
 
+/* For the reverse mapping, we just pay attention to the Latin-oriented
+ * code blocks. These are
+ *     0000 - 007f Basic Latin
+ *     0080 - 00ff Latin-1 Supplement
+ *     0100 - 017f Latin Extended-A
+ *     0180 - 024f Latin Extended-B
+ *     1e00 - 1eff Latin Extended Additional
+ *
+ * We have a special case to map Ohm U2126 back to T.61 0xe0. All other
+ * unrecognized characters are replaced with '?' 0x3f.
+ */
+
+static const wvec64 u000 = {
+       0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+       0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
+       0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+       0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
+       0x0020, 0x0021, 0x0022, 0x00a6, 0x00a4, 0x0025, 0x0026, 0x0027,
+       0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
+       0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+       0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f};
+
+/* In this range, we've mapped caret to xc3/x20, backquote to xc1/x20,
+ * and tilde to xc4/x20. T.61 (stupidly!) doesn't define these characters
+ * on their own, even though it provides them as combiners for other
+ * letters. T.61 doesn't define these pairings either, so this may just
+ * have to be replaced with '?' 0x3f if other software can't cope with it.
+ */
+static const wvec64 u001 = {
+       0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+       0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+       0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+       0x0058, 0x0059, 0x005a, 0x005b, 0x003f, 0x005d, 0xc320, 0x005f,
+       0xc120, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+       0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
+       0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+       0x0078, 0x0079, 0x007a, 0x003f, 0x007c, 0x003f, 0xc420, 0x007f};
+
+static const wvec64 u002 = {
+       0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
+       0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
+       0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
+       0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
+       0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a8, 0x00a5, 0x003f, 0x00a7,
+       0xc820, 0x003f, 0x00e3, 0x00ab, 0x003f, 0x003f, 0x003f, 0xc520,
+       0x00b0, 0x00b1, 0x00b2, 0x00b3, 0xc220, 0x00b5, 0x00b6, 0x00b7,
+       0xcb20, 0x003f, 0x00eb, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf};
+
+static const wvec64 u003 = {
+       0xc141, 0xc241, 0xc341, 0xc441, 0xc841, 0xca41, 0x00e1, 0xcb43,
+       0xc145, 0xc245, 0xc345, 0xc845, 0xc149, 0xc249, 0xc349, 0xc849,
+       0x00e2, 0xc44e, 0xc14f, 0xc24f, 0xc34f, 0xc44f, 0xc84f, 0x00b4,
+       0x00e9, 0xc155, 0xc255, 0xc355, 0xc855, 0xc259, 0x00ec, 0x00fb,
+       0xc161, 0xc261, 0xc361, 0xc461, 0xc861, 0xca61, 0x00f1, 0xcb63,
+       0xc165, 0xc265, 0xc365, 0xc865, 0xc169, 0xc269, 0xc369, 0xc869,
+       0x00f3, 0xc46e, 0xc16f, 0xc26f, 0xc36f, 0xc46f, 0xc86f, 0x00b8,
+       0x00f9, 0xc175, 0xc275, 0xc375, 0xc875, 0xc279, 0x00fc, 0xc879};
+
+/* These codes are used here but not defined by T.61:
+ * x114 = xc6/x45, x115 = xc6/x65, x12c = xc6/x49, x12d = xc6/x69
+ */
+static const wvec64 u010 = {
+       0xc541, 0xc561, 0xc641, 0xc661, 0xce41, 0xce61, 0xc243, 0xc263,
+       0xc343, 0xc363, 0xc743, 0xc763, 0xcf43, 0xcf63, 0xcf44, 0xcf64,
+       0x003f, 0x00f2, 0xc545, 0xc565, 0xc645, 0xc665, 0xc745, 0xc765,
+       0xce45, 0xce65, 0xcf45, 0xcf65, 0xc347, 0xc367, 0xc647, 0xc667,
+       0xc747, 0xc767, 0xcb47, 0xcb67, 0xc348, 0xc368, 0x00e4, 0x00f4,
+       0xc449, 0xc469, 0xc549, 0xc569, 0xc649, 0xc669, 0xce49, 0xce69,
+       0xc749, 0x00f5, 0x00e6, 0x00f6, 0xc34a, 0xc36a, 0xcb4b, 0xcb6b,
+       0x00f0, 0xc24c, 0xc26c, 0xcb4c, 0xcb6c, 0xcf4c, 0xcf6c, 0x00e7};
+
+/* These codes are used here but not defined by T.61:
+ * x14e = xc6/x4f, x14f = xc6/x6f
+ */
+static const wvec64 u011 = {
+       0x00f7, 0x00e8, 0x00f8, 0xc24e, 0xc26e, 0xcb4e, 0xcb6e, 0xcf4e,
+       0xcf6e, 0x00ef, 0x00ee, 0x00fe, 0xc54f, 0xc56f, 0xc64f, 0xc66f,
+       0xcd4f, 0xcd6f, 0x00ea, 0x00fa, 0xc252, 0xc272, 0xcb52, 0xcb72,
+       0xcf52, 0xcf72, 0xc253, 0xc273, 0xc353, 0xc373, 0xcb53, 0xcb73,
+       0xcf53, 0xcf73, 0xcb54, 0xcb74, 0xcf54, 0xcf74, 0x00ed, 0x00fd,
+       0xc455, 0xc475, 0xc555, 0xc575, 0xc655, 0xc675, 0xca55, 0xca75,
+       0xcd55, 0xcd75, 0xce55, 0xce75, 0xc357, 0xc377, 0xc359, 0xc379,
+       0xc859, 0xc25a, 0xc27a, 0xc75a, 0xc77a, 0xcf5a, 0xcf7a, 0x003f};
+
+/* All of the codes in this block are undefined in T.61.
+ */
+static const wvec64 u013 = {
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf41, 0xcf61, 0xcf49,
+       0xcf69, 0xcf4f, 0xcf6f, 0xcf55, 0xcf75, 0x003f, 0x003f, 0x003f, 
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0xc5e1, 0xc5f1, 0x003f, 0x003f, 0xcf47, 0xcf67,
+       0xcf4b, 0xcf6b, 0xce4f, 0xce6f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0xcf6a, 0x003f, 0x003f, 0x003f, 0xc247, 0xc267, 0x003f, 0x003f,
+       0xc14e, 0xc16e, 0x003f, 0x003f, 0xc2e1, 0xc2f1, 0x003f, 0x003f};
+
+/* All of the codes in this block are undefined in T.61.
+ */
+static const wvec64 u020 = {
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf48, 0xcf68,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc741, 0xc761,
+       0xcb45, 0xcb65, 0x003f, 0x003f, 0x003f, 0x003f, 0xc74f, 0xc76f,
+       0x003f, 0x003f, 0xc559, 0xc579, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
+
+static const wvec64 u023 = {
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xcf20,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0xc620, 0xc720, 0xca20, 0xce20, 0x003f, 0xcd20, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
+
+/* These are the non-spacing characters by themselves. They should
+ * never appear by themselves in actual text.
+ */
+static const wvec64 u030 = {
+       0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x003f, 0x00c6, 0x00c7,
+       0x00c8, 0x003f, 0x00ca, 0x00cd, 0x00cf, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x00cb,
+       0x00ce, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x00cc, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f};
+
+/* None of the following blocks are defined in T.61.
+ */
+static const wvec64 u1e0 = {
+       0x003f, 0x003f, 0xc742, 0xc762, 0x003f, 0x003f, 0x003f, 0x003f, 
+       0x003f, 0x003f, 0xc744, 0xc764, 0x003f, 0x003f, 0x003f, 0x003f,
+       0xcb44, 0xcb64, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc746, 0xc766,
+       0xc547, 0xc567, 0xc748, 0xc768, 0x003f, 0x003f, 0xc848, 0xc868,
+       0xcb48, 0xcb68, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0xc24b, 0xc26b, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc24d, 0xc26d,
+};
+
+static const wvec64 u1e1 = {
+       0xc74d, 0xc76d, 0x003f, 0x003f, 0xc74e, 0xc76e, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0xc250, 0xc270, 0xc750, 0xc770,
+       0xc752, 0xc772, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0xc753, 0xc773, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0xc754, 0xc774, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0xc456, 0xc476, 0x003f, 0x003f,
+};
+
+static const wvec64 u1e2 = {
+       0xc157, 0xc177, 0xc257, 0xc277, 0xc857, 0xc877, 0xc757, 0xc777,
+       0x003f, 0x003f, 0xc758, 0xc778, 0xc858, 0xc878, 0xc759, 0xc779,
+       0xc35a, 0xc37a, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0xc874,
+       0xca77, 0xca79, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0xc445, 0xc465, 0x003f, 0x003f,
+};
+
+static const wvec64 u1e3 = {
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+       0x003f, 0x003f, 0xc159, 0xc179, 0x003f, 0x003f, 0x003f, 0x003f,
+       0xc459, 0xc479, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
+};
+
+static const wvec64 *wc00[] = {
+       &u000, &u001, &u002, &u003,
+       &u010, &u011, NULL, &u013,
+       &u020, NULL, NULL, &u023,
+       &u030, NULL, NULL, NULL};
+
+static const wvec64 *wc1e[] = {
+       &u1e0, &u1e1, &u1e2, &u1e3};
+
+
 int ldap_utf8s_to_t61s( struct berval *src, struct berval *dst )
 {
+       char *c, *d;
+       wchar_t tmp;
+       int i, j, tlen = 0;
+
+       /* Just count the length of the T.61 result first */
+       for (i=0,c=src->bv_val; i < src->bv_len;) {
+               j = ldap_x_utf8_to_wc( &tmp, c );
+               if (j == -1)
+                       return LDAP_INVALID_SYNTAX;
+               switch (tmp >> 8) {
+               case 0x00:
+               case 0x01:
+               case 0x02:
+               case 0x03:
+                       if (wc00[tmp >> 6] &&
+                               ((*wc00[tmp >> 6])[tmp & 0x3f] & 0xff00)) {
+                               tlen++;
+                       }
+                       tlen++;
+                       break;
+               case 0x1e:
+                       if ((*wc1e[(tmp >> 6) & 3])[tmp & 0x3f] & 0xff00) {
+                               tlen++;
+                       }
+               case 0x21:
+               default:
+                       tlen ++;
+                       break;
+               }
+               i += j;
+               c += j;
+       }
+       dst->bv_len = tlen;
+       dst->bv_val = LDAP_MALLOC( tlen+1 );
+       if (!dst->bv_val)
+               return LDAP_NO_MEMORY;
+       
+       d = dst->bv_val;
+       for (i=0,c=src->bv_val; i < src->bv_len;) {
+               j = ldap_x_utf8_to_wc( &tmp, c );
+               switch (tmp >> 8) {
+               case 0x00:
+               case 0x01:
+               case 0x02:
+                       if (wc00[tmp >> 6]) {
+                               tmp = (*wc00[tmp >> 6])[tmp & 0x3f];
+                               if (tmp & 0xff00)
+                                       *d++ = (tmp >> 8);
+                               *d++ = tmp & 0xff;
+                       } else {
+                               *d++ = 0x3f;
+                       }
+                       break;
+               case 0x03:
+                       /* swap order of non-spacing characters */
+                       if (wc00[tmp >> 6]) {
+                               wchar_t t2 = (*wc00[tmp >> 6])[tmp & 0x3f];
+                               if (t2 != 0x3f) {
+                                       d[0] = d[-1];
+                                       d[-1] = t2;
+                                       d++;
+                               } else {
+                                       *d++ = 0x3f;
+                               }
+                       } else {
+                               *d++ = 0x3f;
+                       }
+                       break;
+               case 0x1e:
+                       tmp = (*wc1e[(tmp >> 6) & 3])[tmp & 0x3f];
+                       if (tmp & 0xff00)
+                               *d++ = (tmp >> 8);
+                       *d++ = tmp & 0xff;
+                       break;
+               case 0x21:
+                       if (tmp == 0x2126) {
+                               *d++ = 0xe0;
+                               break;
+                       }
+                       /* FALLTHRU */
+               default:
+                       *d++ = 0x3f;
+                       break;
+               }
+       }
+       *d = '\0';
+       return LDAP_SUCCESS;
 }