#
-# $Id: README,v 1.32 1999/11/29 16:41:05 mleisher Exp $
+# $Id: README,v 1.33 2001/01/02 18:46:19 mleisher Exp $
#
- MUTT UCData Package 2.4
+ MUTT UCData Package 2.5
-----------------------
This is a package that supports ctype-like operations for Unicode UCS-2 text
A. case.dat - the case mappings.
B. ctype.dat - the character property tables.
- C. decomp.dat - the character decompositions.
- D. cmbcl.dat - the non-zero combining classes.
- E. num.dat - the codes representing numbers.
+ C. comp.dat - the character composition pairs.
+ D. decomp.dat - the character decompositions.
+ E. cmbcl.dat - the non-zero combining classes.
+ F. num.dat - the codes representing numbers.
2. The "ucdata.[ch]" files which implement the functions needed to
check to see if a character matches groups of properties, to map between
The data is almost all stored as unsigned longs (32-bits assumed) and the
routines that load the data take care of endian swaps when necessary. This
-also means that surrogates (>= 0x10000) can be placed in the data files the
-"ucgendat" program parses.
+also means that supplementary characters (>= 0x10000) can be placed in the
+data files the "ucgendat" program parses.
-The data is written as external files and broken into five parts so it can be
+The data is written as external files and broken into six parts so it can be
selectively updated at runtime if necessary.
The data files currently generated from the "ucgendat" program total about 56K
CHANGES
=======
+Version 2.5
+-----------
+1. Changed the number lookup to set the denominator to 1 in cases of digits.
+ This restores functional compatibility with John Cowan's UCType package.
+
+2. Added support for the AL property.
+
+3. Modified load and reload functions to return error codes.
Version 2.4
-----------
Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
error and an initialization error.
+
+Thanks go to Stig Venaas <Stig.Venaas@uninett.no> for providing a patch to
+support return types on load and reload, and for major updates to handle
+canonical composition and decomposition.
#
-# $Id: api.txt,v 1.2 1999/11/19 15:24:29 mleisher Exp $
+# $Id: api.txt,v 1.3 2001/01/02 18:46:20 mleisher Exp $
#
The MUTT UCData API
#define UCDATA_DECOMP 0x04
#define UCDATA_CMBCL 0x08
#define UCDATA_NUM 0x10
+#define UCDATA_COMP 0x20
#define UCATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
- UCDATA_CMBCL|UCDATA_NUM)
+ UCDATA_CMBCL|UCDATA_NUM|UCDATA_COMP)
-----------------------------------------------------------------------------
void ucdata_load(char *paths, int masks)
putchar('\n');
}
+int uccanondecomp(const unsigned long *in, int inlen, unsigned long **out,
+ int *outlen)
+
+ This function decomposes an input string and does canonical reordering of
+ the characters at the same time.
+
+ If a -1 is returned, memory allocation was not successful. If a zero is
+ returned, no decomposition occurred. Any other value means the output string
+ contains the fully decomposed string in canonical order.
+
+ If the "outlen" parameter comes back with a value > 0, then the string
+ returned in the "out" parameter needs to be deallocated by the caller.
+
-----------------------------------------------------------------------------
int ucdecomp_hangul(unsigned long code, unsigned long *num,
-----------------------------------------------------------------------------
+int uccomp(unsigned long ch1, unsigned long ch2, unsigned long *comp)
+
+ This function takes a pair of characters and determines if they combine to
+ form another character.
+
+ If a zero is returned, no composition is formed by the character pair. Any
+ other value indicates the "comp" parameter has a value.
+
+int uccomp_hangul(unsigned long *str, int len)
+
+ This function composes the Hangul Jamo in the string. The composition is
+ done in-place.
+
+ The return value provides the new length of the string. This will be
+ smaller than "len" if compositions occurred.
+
+int uccanoncomp(unsigned long *str, int len)
+
+ This function does a canonical composition of characters in the string.
+
+ The return value is the new length of the string.
+
+-----------------------------------------------------------------------------
+
struct ucnumber {
int numerator;
int denominator;
#
-# $Id: format.txt,v 1.1 1998/07/24 15:17:21 mleisher Exp $
+# $Id: format.txt,v 1.2 2001/01/02 18:46:20 mleisher Exp $
#
CHARACTER DATA
distribution of mappings may be more or less than 21845 per table, but only
65536 are allowed.
+COMPOSITIONS
+============
+
+This data file is called "comp.dat" and contains data that tracks character
+pairs that have a single Unicode value representing the combination of the two
+characters.
+
+The format for the binary form of this table is:
+
+ unsigned short ByteOrderMark
+ unsigned short NumCompositionNodes, count of composition nodes
+ unsigned long Bytes, total number of bytes used for composition nodes
+ unsigned long CompositionNodes[NumCompositionNodes * 4]
+
+ If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
+ same way as described in the CHARACTER PROPERTIES section.
+
+ The CompositionNodes[] array consists of groups of 4 unsigned longs. The
+ first of these is the character code representing the combination of two
+ other character codes, the second records the number of character codes that
+ make up the composition (not currently used), and the last two are the pair
+ of character codes whose combination is represented by the character code in
+ the first field.
+
DECOMPOSITIONS
==============
.\"
-.\" $Id: ucdata.man,v 1.4 1999/11/19 16:08:33 mleisher Exp $
+.\" $Id: ucdata.man,v 1.5 2001/01/02 18:46:20 mleisher Exp $
.\"
-.TH ucdata 3 "19 November 1999"
+.TH ucdata 3 "03 January 2001"
.SH NAME
ucdata \- package for providing Unicode/ISO10646 character information
.sp
int ucdecomp(unsigned long code, unsigned long *num, unsigned long **decomp)
.sp
+int uccanondecomp(const unsigned long *in, int inlen, unsigned long **out,
+int *outlen)
+.sp
int ucdecomp_hangul(unsigned long code, unsigned long *num,
unsigned long decomp[])
.sp
+int uccomp(unsigned long ch1, unsigned long ch2, unsigned long *comp)
+.sp
+int uccomp_hangul(unsigned long *str, int len)
+.sp
+int uccanoncomp(unsigned long *str, int len)
.nf
struct ucnumber {
int numerator;
putchar('\n');
}
.TP 4
+.BR uccanondecomp()
+This function will decompose a string, ensuring the characters are in
+canonical order for comparison.
+.sp
+If a decomposed string is returned, the caller is responsible for deallocating
+the string.
+.sp
+If a -1 is returned, memory allocation failed. If a zero is returned, no
+decomposition was done. Any other value means a decomposition string was
+created and its values are returned in the `out' and `outlen' parameters.
+.TP 4
.BR ucdecomp_hangul()
This function determines if a Hangul syllable has a
decomposition and returns the decomposition information.
putchar('\n');
}
.TP 4
+.BR uccomp()
+This function determines if a pair of characters have a composition, and
+returns that composition if one exists.
+.sp
+A zero is returned if no composition exists for the character pair. Any other
+value indicates the `comp' field holds the character code representing the
+composition of the two character codes.
+.TP 4
+.BR uccomp_hangul()
+This composes the Hangul Jamo in-place in the string.
+.sp
+The returned value is the new length of the string.
+.TP 4
+.BR uccanoncomp()
+This function does a full composition in-place in the string, including the
+Hangul composition.
+.sp
+The returned value is the new length of the string.
+.TP 4
.BR ucnumber_lookup()
This function determines if the code is a number and
fills in the `num' field with the numerator and
Kent Johnson <kent@pondview.mv.com>
.br
Valeriy E. Ushakov <uwe@ptc.spbu.ru>
+.br
+Stig Venaas <Stig.Venaas@uninett.no>
.SH AUTHOR
Mark Leisher