@ -182,10 +182,14 @@ typedef unsigned char utf8leaf_t;
# define MAXCCC (254)
# define STOPPER (0)
# define DECOMPOSE (255)
# define HANGUL ((char)(255))
# define UTF8HANGULLEAF (12)
struct tree ;
static utf8leaf_t * utf8nlookup ( struct tree * , const char * , size_t ) ;
static utf8leaf_t * utf8lookup ( struct tree * , const char * ) ;
static utf8leaf_t * utf8nlookup ( struct tree * , unsigned char * ,
const char * , size_t ) ;
static utf8leaf_t * utf8lookup ( struct tree * , unsigned char * , const char * ) ;
unsigned char * utf8data ;
size_t utf8data_size ;
@ -333,6 +337,8 @@ static int utf32valid(unsigned int unichar)
return unichar < 0x110000 ;
}
# define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
# define NODE 1
# define LEAF 0
@ -463,7 +469,7 @@ static void tree_walk(struct tree *tree)
indent + 1 ) ;
leaves + = 1 ;
} else if ( node - > right ) {
assert ( node - > rightnode = = NODE ) ;
assert ( node - > rightnode = = NODE ) ;
indent + = 1 ;
node = node - > right ;
break ;
@ -857,7 +863,7 @@ static void mark_nodes(struct tree *tree)
}
}
} else if ( node - > right ) {
assert ( node - > rightnode = = NODE ) ;
assert ( node - > rightnode = = NODE ) ;
node = node - > right ;
continue ;
}
@ -909,7 +915,7 @@ static void mark_nodes(struct tree *tree)
}
}
} else if ( node - > right ) {
assert ( node - > rightnode = = NODE ) ;
assert ( node - > rightnode = = NODE ) ;
node = node - > right ;
if ( ! node - > mark & & node - > parent - > mark & &
! node - > parent - > left ) {
@ -992,7 +998,7 @@ skip:
index + = tree - > leaf_size ( node - > right ) ;
count + + ;
} else if ( node - > right ) {
assert ( node - > rightnode = = NODE ) ;
assert ( node - > rightnode = = NODE ) ;
indent + = 1 ;
node = node - > right ;
break ;
@ -1013,6 +1019,25 @@ done:
return index ;
}
/*
* Mark the nodes in a subtree , helper for size_nodes ( ) .
*/
static int mark_subtree ( struct node * node )
{
int changed ;
if ( ! node | | node - > mark )
return 0 ;
node - > mark = 1 ;
node - > index = node - > parent - > index ;
changed = 1 ;
if ( node - > leftnode = = NODE )
changed + = mark_subtree ( node - > left ) ;
if ( node - > rightnode = = NODE )
changed + = mark_subtree ( node - > right ) ;
return changed ;
}
/*
* Compute the size of nodes and leaves . We start by assuming that
* each node needs to store a three - byte offset . The indexes of the
@ -1031,6 +1056,7 @@ static int size_nodes(struct tree *tree)
unsigned int bitmask ;
unsigned int pathbits ;
unsigned int pathmask ;
unsigned int nbit ;
int changed ;
int offset ;
int size ;
@ -1058,22 +1084,40 @@ static int size_nodes(struct tree *tree)
size = 1 ;
} else {
if ( node - > rightnode = = NODE ) {
/*
* If the right node is not marked ,
* look for a corresponding node in
* the next tree . Such a node need
* not exist .
*/
right = node - > right ;
next = tree - > next ;
while ( ! right - > mark ) {
assert ( next ) ;
n = next - > root ;
while ( n - > bitnum ! = node - > bitnum ) {
if ( pathbits & ( 1 < < n - > bitnum ) )
nbit = 1 < < n - > bitnum ;
if ( ! ( pathmask & nbit ) )
break ;
if ( pathbits & nbit ) {
if ( n - > rightnode = = LEAF )
break ;
n = n - > right ;
else
} else {
if ( n - > leftnode = = LEAF )
break ;
n = n - > left ;
}
}
if ( n - > bitnum ! = node - > bitnum )
break ;
n = n - > right ;
assert ( right - > bitnum = = n - > bitnum ) ;
right = n ;
next = next - > next ;
}
/* Make sure the right node is marked. */
if ( ! right - > mark )
changed + = mark_subtree ( right ) ;
offset = right - > index - node - > index ;
} else {
offset = * tree - > leaf_index ( tree , node - > right ) ;
@ -1115,7 +1159,7 @@ skip:
if ( node - > rightnode = = LEAF ) {
assert ( node - > right ) ;
} else if ( node - > right ) {
assert ( node - > rightnode = = NODE ) ;
assert ( node - > rightnode = = NODE ) ;
indent + = 1 ;
node = node - > right ;
break ;
@ -1148,8 +1192,15 @@ static void emit(struct tree *tree, unsigned char *data)
int offset ;
int index ;
int indent ;
int size ;
int bytes ;
int leaves ;
int nodes [ 4 ] ;
unsigned char byte ;
nodes [ 0 ] = nodes [ 1 ] = nodes [ 2 ] = nodes [ 3 ] = 0 ;
leaves = 0 ;
bytes = 0 ;
index = tree - > index ;
data + = index ;
indent = 1 ;
@ -1158,7 +1209,10 @@ static void emit(struct tree *tree, unsigned char *data)
if ( tree - > childnode = = LEAF ) {
assert ( tree - > root ) ;
tree - > leaf_emit ( tree - > root , data ) ;
return ;
size = tree - > leaf_size ( tree - > root ) ;
index + = size ;
leaves + + ;
goto done ;
}
assert ( tree - > childnode = = NODE ) ;
@ -1185,6 +1239,7 @@ static void emit(struct tree *tree, unsigned char *data)
offlen = 2 ;
else
offlen = 3 ;
nodes [ offlen ] + + ;
offset = node - > offset ;
byte | = offlen < < OFFLEN_SHIFT ;
* data + + = byte ;
@ -1197,12 +1252,14 @@ static void emit(struct tree *tree, unsigned char *data)
} else if ( node - > left ) {
if ( node - > leftnode = = NODE )
byte | = TRIENODE ;
nodes [ 0 ] + + ;
* data + + = byte ;
index + + ;
} else if ( node - > right ) {
byte | = RIGHTNODE ;
if ( node - > rightnode = = NODE )
byte | = TRIENODE ;
nodes [ 0 ] + + ;
* data + + = byte ;
index + + ;
} else {
@ -1217,7 +1274,10 @@ skip:
assert ( node - > left ) ;
data = tree - > leaf_emit ( node - > left ,
data ) ;
index + = tree - > leaf_size ( node - > left ) ;
size = tree - > leaf_size ( node - > left ) ;
index + = size ;
bytes + = size ;
leaves + + ;
} else if ( node - > left ) {
assert ( node - > leftnode = = NODE ) ;
indent + = 1 ;
@ -1231,9 +1291,12 @@ skip:
assert ( node - > right ) ;
data = tree - > leaf_emit ( node - > right ,
data ) ;
index + = tree - > leaf_size ( node - > right ) ;
size = tree - > leaf_size ( node - > right ) ;
index + = size ;
bytes + = size ;
leaves + + ;
} else if ( node - > right ) {
assert ( node - > rightnode = = NODE ) ;
assert ( node - > rightnode = = NODE ) ;
indent + = 1 ;
node = node - > right ;
break ;
@ -1245,6 +1308,15 @@ skip:
indent - = 1 ;
}
}
done :
if ( verbose > 0 ) {
printf ( " Emitted %d (%d) leaves " ,
leaves , bytes ) ;
printf ( " %d (%d+%d+%d+%d) nodes " ,
nodes [ 0 ] + nodes [ 1 ] + nodes [ 2 ] + nodes [ 3 ] ,
nodes [ 0 ] , nodes [ 1 ] , nodes [ 2 ] , nodes [ 3 ] ) ;
printf ( " %d total \n " , index - tree - > index ) ;
}
}
/* ------------------------------------------------------------------ */
@ -1346,8 +1418,12 @@ static void nfdi_print(void *l, int indent)
printf ( " %*sleaf @ %p code %X ccc %d gen %d " , indent , " " , leaf ,
leaf - > code , leaf - > ccc , leaf - > gen ) ;
if ( leaf - > utf8nfdi )
if ( leaf - > utf8nfdi & & leaf - > utf8nfdi [ 0 ] = = HANGUL )
printf ( " nfdi \" %s \" " , " HANGUL SYLLABLE " ) ;
else if ( leaf - > utf8nfdi )
printf ( " nfdi \" %s \" " , ( const char * ) leaf - > utf8nfdi ) ;
printf ( " \n " ) ;
}
@ -1357,8 +1433,11 @@ static void nfdicf_print(void *l, int indent)
printf ( " %*sleaf @ %p code %X ccc %d gen %d " , indent , " " , leaf ,
leaf - > code , leaf - > ccc , leaf - > gen ) ;
if ( leaf - > utf8nfdicf )
printf ( " nfdicf \" %s \" " , ( const char * ) leaf - > utf8nfdicf ) ;
else if ( leaf - > utf8nfdi & & leaf - > utf8nfdi [ 0 ] = = HANGUL )
printf ( " nfdi \" %s \" " , " HANGUL SYLLABLE " ) ;
else if ( leaf - > utf8nfdi )
printf ( " nfdi \" %s \" " , ( const char * ) leaf - > utf8nfdi ) ;
printf ( " \n " ) ;
@ -1388,9 +1467,11 @@ static int correction_mark(void *l)
static int nfdi_size ( void * l )
{
struct unicode_data * leaf = l ;
int size = 2 ;
if ( leaf - > utf8nfdi )
if ( HANGUL_SYLLABLE ( leaf - > code ) )
size + = 1 ;
else if ( leaf - > utf8nfdi )
size + = strlen ( leaf - > utf8nfdi ) + 1 ;
return size ;
}
@ -1398,9 +1479,11 @@ static int nfdi_size(void *l)
static int nfdicf_size ( void * l )
{
struct unicode_data * leaf = l ;
int size = 2 ;
if ( leaf - > utf8nfdicf )
if ( HANGUL_SYLLABLE ( leaf - > code ) )
size + = 1 ;
else if ( leaf - > utf8nfdicf )
size + = strlen ( leaf - > utf8nfdicf ) + 1 ;
else if ( leaf - > utf8nfdi )
size + = strlen ( leaf - > utf8nfdi ) + 1 ;
@ -1427,7 +1510,11 @@ static unsigned char *nfdi_emit(void *l, unsigned char *data)
unsigned char * s ;
* data + + = leaf - > gen ;
if ( leaf - > utf8nfdi ) {
if ( HANGUL_SYLLABLE ( leaf - > code ) ) {
* data + + = DECOMPOSE ;
* data + + = HANGUL ;
} else if ( leaf - > utf8nfdi ) {
* data + + = DECOMPOSE ;
s = ( unsigned char * ) leaf - > utf8nfdi ;
while ( ( * data + + = * s + + ) ! = 0 )
@ -1444,7 +1531,11 @@ static unsigned char *nfdicf_emit(void *l, unsigned char *data)
unsigned char * s ;
* data + + = leaf - > gen ;
if ( leaf - > utf8nfdicf ) {
if ( HANGUL_SYLLABLE ( leaf - > code ) ) {
* data + + = DECOMPOSE ;
* data + + = HANGUL ;
} else if ( leaf - > utf8nfdicf ) {
* data + + = DECOMPOSE ;
s = ( unsigned char * ) leaf - > utf8nfdicf ;
while ( ( * data + + = * s + + ) ! = 0 )
@ -1467,6 +1558,11 @@ static void utf8_create(struct unicode_data *data)
unsigned int * um ;
int i ;
if ( data - > utf8nfdi ) {
assert ( data - > utf8nfdi [ 0 ] = = HANGUL ) ;
return ;
}
u = utf ;
um = data - > utf32nfdi ;
if ( um ) {
@ -1652,6 +1748,7 @@ static void verify(struct tree *tree)
utf8leaf_t * leaf ;
unsigned int unichar ;
char key [ 4 ] ;
unsigned char hangul [ UTF8HANGULLEAF ] ;
int report ;
int nocf ;
@ -1665,7 +1762,8 @@ static void verify(struct tree *tree)
if ( data - > correction < = tree - > maxage )
data = & unicode_data [ unichar ] ;
utf8encode ( key , unichar ) ;
leaf = utf8lookup ( tree , key ) ;
leaf = utf8lookup ( tree , hangul , key ) ;
if ( ! leaf ) {
if ( data - > gen ! = - 1 )
report + + ;
@ -1679,7 +1777,10 @@ static void verify(struct tree *tree)
if ( data - > gen ! = LEAF_GEN ( leaf ) )
report + + ;
if ( LEAF_CCC ( leaf ) = = DECOMPOSE ) {
if ( nocf ) {
if ( HANGUL_SYLLABLE ( data - > code ) ) {
if ( data - > utf8nfdi [ 0 ] ! = HANGUL )
report + + ;
} else if ( nocf ) {
if ( ! data - > utf8nfdi ) {
report + + ;
} else if ( strcmp ( data - > utf8nfdi ,
@ -2323,8 +2424,7 @@ static void corrections_init(void)
*
*/
static void
hangul_decompose ( void )
static void hangul_decompose ( void )
{
unsigned int sb = 0xAC00 ;
unsigned int lb = 0x1100 ;
@ -2368,6 +2468,15 @@ hangul_decompose(void)
memcpy ( um , mapping , i * sizeof ( unsigned int ) ) ;
unicode_data [ unichar ] . utf32nfdicf = um ;
/*
* Add a cookie as a reminder that the hangul syllable
* decompositions must not be stored in the generated
* trie .
*/
unicode_data [ unichar ] . utf8nfdi = malloc ( 2 ) ;
unicode_data [ unichar ] . utf8nfdi [ 0 ] = HANGUL ;
unicode_data [ unichar ] . utf8nfdi [ 1 ] = ' \0 ' ;
if ( verbose > 1 )
print_utf32nfdi ( unichar ) ;
@ -2493,6 +2602,99 @@ int utf8cursor(struct utf8cursor *, struct tree *, const char *);
int utf8ncursor ( struct utf8cursor * , struct tree * , const char * , size_t ) ;
int utf8byte ( struct utf8cursor * ) ;
/*
* Hangul decomposition ( algorithm from Section 3.12 of Unicode 6.3 .0 )
*
* AC00 ; < Hangul Syllable , First > ; Lo ; 0 ; L ; ; ; ; ; N ; ; ; ; ;
* D7A3 ; < Hangul Syllable , Last > ; Lo ; 0 ; L ; ; ; ; ; N ; ; ; ; ;
*
* SBase = 0xAC00
* LBase = 0x1100
* VBase = 0x1161
* TBase = 0x11A7
* LCount = 19
* VCount = 21
* TCount = 28
* NCount = 588 ( VCount * TCount )
* SCount = 11172 ( LCount * NCount )
*
* Decomposition :
* SIndex = s - SBase
*
* LV ( Canonical / Full )
* LIndex = SIndex / NCount
* VIndex = ( SIndex % NCount ) / TCount
* LPart = LBase + LIndex
* VPart = VBase + VIndex
*
* LVT ( Canonical )
* LVIndex = ( SIndex / TCount ) * TCount
* TIndex = ( SIndex % TCount )
* LVPart = SBase + LVIndex
* TPart = TBase + TIndex
*
* LVT ( Full )
* LIndex = SIndex / NCount
* VIndex = ( SIndex % NCount ) / TCount
* TIndex = ( SIndex % TCount )
* LPart = LBase + LIndex
* VPart = VBase + VIndex
* if ( TIndex = = 0 ) {
* d = < LPart , VPart >
* } else {
* TPart = TBase + TIndex
* d = < LPart , VPart , TPart >
* }
*/
/* Constants */
# define SB (0xAC00)
# define LB (0x1100)
# define VB (0x1161)
# define TB (0x11A7)
# define LC (19)
# define VC (21)
# define TC (28)
# define NC (VC * TC)
# define SC (LC * NC)
/* Algorithmic decomposition of hangul syllable. */
static utf8leaf_t * utf8hangul ( const char * str , unsigned char * hangul )
{
unsigned int si ;
unsigned int li ;
unsigned int vi ;
unsigned int ti ;
unsigned char * h ;
/* Calculate the SI, LI, VI, and TI values. */
si = utf8decode ( str ) - SB ;
li = si / NC ;
vi = ( si % NC ) / TC ;
ti = si % TC ;
/* Fill in base of leaf. */
h = hangul ;
LEAF_GEN ( h ) = 2 ;
LEAF_CCC ( h ) = DECOMPOSE ;
h + = 2 ;
/* Add LPart, a 3-byte UTF-8 sequence. */
h + = utf8encode ( ( char * ) h , li + LB ) ;
/* Add VPart, a 3-byte UTF-8 sequence. */
h + = utf8encode ( ( char * ) h , vi + VB ) ;
/* Add TPart if required, also a 3-byte UTF-8 sequence. */
if ( ti )
h + = utf8encode ( ( char * ) h , ti + TB ) ;
/* Terminate string. */
h [ 0 ] = ' \0 ' ;
return hangul ;
}
/*
* Use trie to scan s , touching at most len bytes .
* Returns the leaf if one exists , NULL otherwise .
@ -2501,7 +2703,8 @@ int utf8byte(struct utf8cursor *);
* is well - formed and corresponds to a known unicode code point . The
* shorthand for this will be " is valid UTF-8 unicode " .
*/
static utf8leaf_t * utf8nlookup ( struct tree * tree , const char * s , size_t len )
static utf8leaf_t * utf8nlookup ( struct tree * tree , unsigned char * hangul ,
const char * s , size_t len )
{
utf8trie_t * trie ;
int offlen ;
@ -2558,6 +2761,14 @@ static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len)
}
}
}
/*
* Hangul decomposition is done algorithmically . These are the
* codepoints > = 0xAC00 and < = 0xD7A3 . Their UTF - 8 encoding is
* always 3 bytes long , so s has been advanced twice , and the
* start of the sequence is at s - 2.
*/
if ( LEAF_CCC ( trie ) = = DECOMPOSE & & LEAF_STR ( trie ) [ 0 ] = = HANGUL )
trie = utf8hangul ( s - 2 , hangul ) ;
return trie ;
}
@ -2567,9 +2778,10 @@ static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len)
*
* Forwards to trie_nlookup ( ) .
*/
static utf8leaf_t * utf8lookup ( struct tree * tree , const char * s )
static utf8leaf_t * utf8lookup ( struct tree * tree , unsigned char * hangul ,
const char * s )
{
return utf8nlookup ( tree , s , ( size_t ) - 1 ) ;
return utf8nlookup ( tree , hangul , s , ( size_t ) - 1 ) ;
}
/*
@ -2593,11 +2805,14 @@ int utf8agemax(struct tree *tree, const char *s)
utf8leaf_t * leaf ;
int age = 0 ;
int leaf_age ;
unsigned char hangul [ UTF8HANGULLEAF ] ;
if ( ! tree )
return - 1 ;
while ( * s ) {
if ( ! ( leaf = utf8lookup ( tree , s ) ) )
leaf = utf8lookup ( tree , hangul , s ) ;
if ( ! leaf )
return - 1 ;
leaf_age = ages [ LEAF_GEN ( leaf ) ] ;
if ( leaf_age < = tree - > maxage & & leaf_age > age )
@ -2617,12 +2832,14 @@ int utf8agemin(struct tree *tree, const char *s)
utf8leaf_t * leaf ;
int age ;
int leaf_age ;
unsigned char hangul [ UTF8HANGULLEAF ] ;
if ( ! tree )
return - 1 ;
age = tree - > maxage ;
while ( * s ) {
if ( ! ( leaf = utf8lookup ( tree , s ) ) )
leaf = utf8lookup ( tree , hangul , s ) ;
if ( ! leaf )
return - 1 ;
leaf_age = ages [ LEAF_GEN ( leaf ) ] ;
if ( leaf_age < = tree - > maxage & & leaf_age < age )
@ -2641,11 +2858,14 @@ int utf8nagemax(struct tree *tree, const char *s, size_t len)
utf8leaf_t * leaf ;
int age = 0 ;
int leaf_age ;
unsigned char hangul [ UTF8HANGULLEAF ] ;
if ( ! tree )
return - 1 ;
while ( len & & * s ) {
if ( ! ( leaf = utf8nlookup ( tree , s , len ) ) )
leaf = utf8nlookup ( tree , hangul , s , len ) ;
if ( ! leaf )
return - 1 ;
leaf_age = ages [ LEAF_GEN ( leaf ) ] ;
if ( leaf_age < = tree - > maxage & & leaf_age > age )
@ -2665,12 +2885,14 @@ int utf8nagemin(struct tree *tree, const char *s, size_t len)
utf8leaf_t * leaf ;
int leaf_age ;
int age ;
unsigned char hangul [ UTF8HANGULLEAF ] ;
if ( ! tree )
return - 1 ;
age = tree - > maxage ;
while ( len & & * s ) {
if ( ! ( leaf = utf8nlookup ( tree , s , len ) ) )
leaf = utf8nlookup ( tree , hangul , s , len ) ;
if ( ! leaf )
return - 1 ;
leaf_age = ages [ LEAF_GEN ( leaf ) ] ;
if ( leaf_age < = tree - > maxage & & leaf_age < age )
@ -2691,11 +2913,13 @@ ssize_t utf8len(struct tree *tree, const char *s)
{
utf8leaf_t * leaf ;
size_t ret = 0 ;
unsigned char hangul [ UTF8HANGULLEAF ] ;
if ( ! tree )
return - 1 ;
while ( * s ) {
if ( ! ( leaf = utf8lookup ( tree , s ) ) )
leaf = utf8lookup ( tree , hangul , s ) ;
if ( ! leaf )
return - 1 ;
if ( ages [ LEAF_GEN ( leaf ) ] > tree - > maxage )
ret + = utf8clen ( s ) ;
@ -2716,11 +2940,13 @@ ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
{
utf8leaf_t * leaf ;
size_t ret = 0 ;
unsigned char hangul [ UTF8HANGULLEAF ] ;
if ( ! tree )
return - 1 ;
while ( len & & * s ) {
if ( ! ( leaf = utf8nlookup ( tree , s , len ) ) )
leaf = utf8nlookup ( tree , hangul , s , len ) ;
if ( ! leaf )
return - 1 ;
if ( ages [ LEAF_GEN ( leaf ) ] > tree - > maxage )
ret + = utf8clen ( s ) ;
@ -2748,6 +2974,7 @@ struct utf8cursor {
short int ccc ;
short int nccc ;
unsigned int unichar ;
unsigned char hangul [ UTF8HANGULLEAF ] ;
} ;
/*
@ -2855,10 +3082,12 @@ int utf8byte(struct utf8cursor *u8c)
}
/* Look up the data for the current character. */
if ( u8c - > p )
leaf = utf8lookup ( u8c - > tree , u8c - > s ) ;
else
leaf = utf8nlookup ( u8c - > tree , u8c - > s , u8c - > len ) ;
if ( u8c - > p ) {
leaf = utf8lookup ( u8c - > tree , u8c - > hangul , u8c - > s ) ;
} else {
leaf = utf8nlookup ( u8c - > tree , u8c - > hangul ,
u8c - > s , u8c - > len ) ;
}
/* No leaf found implies that the input is a binary blob. */
if ( ! leaf )
@ -2878,7 +3107,7 @@ int utf8byte(struct utf8cursor *u8c)
ccc = STOPPER ;
goto ccc_mismatch ;
}
leaf = utf8lookup ( u8c - > tree , u8c - > s ) ;
leaf = utf8lookup ( u8c - > tree , u8c - > hangul , u8c - > s ) ;
ccc = LEAF_CCC ( leaf ) ;
}
u8c - > unichar = utf8decode ( u8c - > s ) ;