Orange Pi5 kernel

Deprecated Linux kernel 5.10.110 for OrangePi 5/5B/5+ boards

3 Commits   0 Branches   0 Tags
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    1) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    2)  * Copyright (c) 2014 SGI.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    3)  * All rights reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    4)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    5)  * This program is free software; you can redistribute it and/or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    6)  * modify it under the terms of the GNU General Public License as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    7)  * published by the Free Software Foundation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    8)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300    9)  * This program is distributed in the hope that it would be useful,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   10)  * but WITHOUT ANY WARRANTY; without even the implied warranty of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   11)  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   12)  * GNU General Public License for more details.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   13)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   14)  * You should have received a copy of the GNU General Public License
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   15)  * along with this program; if not, write the Free Software Foundation,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   16)  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   17)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   18) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   19) /* Generator for a compact trie for unicode normalization */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   20) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   21) #include <sys/types.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   22) #include <stddef.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   23) #include <stdlib.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   24) #include <stdio.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   25) #include <assert.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   26) #include <string.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   27) #include <unistd.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   28) #include <errno.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   29) 
/*
 * Default names of the in- and output files.
 *
 * Each *_name pointer below starts out at the matching compiled-in
 * default.
 * NOTE(review): presumably these pointers are redirected while the
 * command line is parsed; that code is not visible in this part of
 * the file - confirm.
 */

#define AGE_NAME	"DerivedAge.txt"
#define CCC_NAME	"DerivedCombiningClass.txt"
#define PROP_NAME	"DerivedCoreProperties.txt"
#define DATA_NAME	"UnicodeData.txt"
#define FOLD_NAME	"CaseFolding.txt"
#define NORM_NAME	"NormalizationCorrections.txt"
#define TEST_NAME	"NormalizationTest.txt"
#define UTF8_NAME	"utf8data.h"

const char	*age_name  = AGE_NAME;
const char	*ccc_name  = CCC_NAME;
const char	*prop_name = PROP_NAME;
const char	*data_name = DATA_NAME;
const char	*fold_name = FOLD_NAME;
const char	*norm_name = NORM_NAME;
const char	*test_name = TEST_NAME;
const char	*utf8_name = UTF8_NAME;

/* Verbosity level; starts at 0. */
int verbose = 0;

/* An arbitrary line size limit on input lines. */

#define LINESIZE	1024
/* File-scope scratch buffers, each LINESIZE bytes long. */
char line[LINESIZE];
char buf0[LINESIZE];
char buf1[LINESIZE];
char buf2[LINESIZE];
char buf3[LINESIZE];

/* NOTE(review): presumably the program name taken from argv[0] - confirm. */
const char *argv0;

/*
 * Number of elements in the array x.  Only meaningful for true arrays:
 * applied to a pointer it divides the pointer size by the element size.
 */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   64) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   65) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300   66) 
/*
 * Unicode version numbers consist of three parts: major, minor, and a
 * revision.  These numbers are packed into an unsigned int to obtain
 * a single version number.
 *
 * To save space in the generated trie, the unicode version is not
 * stored directly, instead we calculate a generation number from the
 * unicode versions seen in the DerivedAge file, and use that as an
 * index into a table of unicode versions.
 *
 * Packed layout produced by UNICODE_AGE():
 *   bits 16 and up: major
 *   bits  8 - 15  : minor
 *   bits  0 -  7  : revision
 */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

/* Largest value each component may take and still fit its field. */
#define UNICODE_MAJ_MAX			((unsigned short)-1)
#define UNICODE_MIN_MAX			((unsigned char)-1)
#define UNICODE_REV_MAX			((unsigned char)-1)

/* Pack MAJ, MIN and REV into one unsigned int; no range checking. */
#define UNICODE_AGE(MAJ,MIN,REV)			\
	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
	 ((unsigned int)(REV)))

/*
 * NOTE(review): presumably the table of unicode versions mentioned
 * above and its element count; the code filling them in is not
 * visible in this part of the file.
 */
unsigned int *ages;
int ages_count;

/* NOTE(review): presumably the highest packed version seen - confirm. */
unsigned int unicode_maxage;

/*
 * age_valid - check that a version triple can be packed by UNICODE_AGE().
 * @major:    major version, at most UNICODE_MAJ_MAX
 * @minor:    minor version, at most UNICODE_MIN_MAX
 * @revision: revision, at most UNICODE_REV_MAX
 *
 * Returns 1 when every component fits its field, 0 otherwise.
 */
static int age_valid(unsigned int major, unsigned int minor,
		     unsigned int revision)
{
	return major <= UNICODE_MAJ_MAX &&
	       minor <= UNICODE_MIN_MAX &&
	       revision <= UNICODE_REV_MAX;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  105) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  106) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  107) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  108) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  109)  * utf8trie_t
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  110)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  111)  * A compact binary tree, used to decode UTF-8 characters.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  112)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  113)  * Internal nodes are one byte for the node itself, and up to three
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  114)  * bytes for an offset into the tree.  The first byte contains the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  115)  * following information:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  116)  *  NEXTBYTE  - flag        - advance to next byte if set
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  117)  *  BITNUM    - 3 bit field - the bit number to be tested
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  118)  *  OFFLEN    - 2 bit field - number of bytes in the offset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  119)  * if offlen == 0 (non-branching node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  120)  *  RIGHTPATH - 1 bit field - set if the following node is for the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  121)  *                            right-hand path (tested bit is set)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  122)  *  TRIENODE  - 1 bit field - set if the following node is an internal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  123)  *                            node, otherwise it is a leaf node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  124)  * if offlen != 0 (branching node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  125)  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  126)  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  127)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  128)  * Due to the way utf8 works, there cannot be branching nodes with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  129)  * NEXTBYTE set, and moreover those nodes always have a righthand
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  130)  * descendant.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  131)  */
/*
 * One byte of the compact trie.  Bits 6 and 7 are shared between two
 * pairs of names: they are RIGHTPATH/TRIENODE when the OFFLEN field
 * is 0 and RIGHTNODE/LEFTNODE when it is not.
 */
typedef unsigned char utf8trie_t;
#define BITNUM		0x07	/* bits 0-2: bit number to be tested */
#define NEXTBYTE	0x08	/* bit 3: advance to the next byte */
#define OFFLEN		0x30	/* bits 4-5: number of offset bytes */
#define OFFLEN_SHIFT	4	/* shift that right-aligns OFFLEN */
#define RIGHTPATH	0x40	/* offlen == 0: next node is the right path */
#define TRIENODE	0x80	/* offlen == 0: next node is internal */
#define RIGHTNODE	0x40	/* offlen != 0: right-hand node is internal */
#define LEFTNODE	0x80	/* offlen != 0: left-hand node is internal */

/*
 * utf8leaf_t
 *
 * The leaves of the trie are embedded in the trie, and so the same
 * underlying datatype, unsigned char.
 *
 * leaf[0]: The unicode version, stored as a generation number that is
 *          an index into utf8agetab[].  With this we can filter code
 *          points based on the unicode version in which they were
 *          defined.  The CCC of a non-defined code point is 0.
 * leaf[1]: Canonical Combining Class. During normalization, we need
 *          to do a stable sort into ascending order of all characters
 *          with a non-zero CCC that occur between two characters with
 *          a CCC of 0, or at the beginning or end of a string.
 *          The unicode standard guarantees that all CCC values are
 *          between 0 and 254 inclusive, which leaves 255 available as
 *          a special value.
 *          Code points with CCC 0 are known as stoppers.
 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 *          start of a NUL-terminated string that is the decomposition
 *          of the character.
 *          The CCC of a decomposable character is the same as the CCC
 *          of the first character of its decomposition.
 *          Some characters decompose as the empty string: these are
 *          characters with the Default_Ignorable_Code_Point property.
 *          These do affect normalization, as they all have CCC 0.
 *
 * The decompositions in the trie have been fully expanded.
 *
 * Casefolding, if applicable, is also done using decompositions.
 */
typedef unsigned char utf8leaf_t;

/* Accessors for the three leaf fields described above. */
#define LEAF_GEN(LEAF)	((LEAF)[0])
#define LEAF_CCC(LEAF)	((LEAF)[1])
#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))

/* Largest generation number that fits in leaf[0]. */
#define MAXGEN		(255)

#define MINCCC		(0)	/* smallest CCC value */
#define MAXCCC		(254)	/* largest CCC value */
#define STOPPER		(0)	/* CCC of a stopper */
#define DECOMPOSE	(255)	/* leaf[1] marker: leaf[2] is a decomposition */
/* NOTE(review): the use of HANGUL is not visible in this part of the file. */
#define HANGUL		((char)(255))

/* NOTE(review): presumably the byte size of a Hangul leaf - confirm. */
#define UTF8HANGULLEAF	(12)

/* Forward declarations; the definitions are not in this part of the file. */
struct tree;
static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
			       const char *, size_t);
static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);

/* NOTE(review): presumably the emitted trie bytes and their length. */
unsigned char *utf8data;
size_t utf8data_size;

/* NOTE(review): presumably the two generated tries - confirm. */
utf8trie_t *nfdi;
utf8trie_t *nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  199) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  200) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  201) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  202) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  203)  * UTF8 valid ranges.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  204)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  205)  * The UTF-8 encoding spreads the bits of a 32bit word over several
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  206)  * bytes. This table gives the ranges that can be held and how they'd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  207)  * be represented.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  208)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  209)  * 0x00000000 0x0000007F: 0xxxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  210)  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  211)  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  212)  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  213)  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  214)  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  215)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  216)  * There is an additional requirement on UTF-8, in that only the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  217)  * shortest representation of a 32bit value is to be used.  A decoder
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  218)  * must not decode sequences that do not satisfy this requirement.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  219)  * Thus the allowed ranges have a lower bound.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  220)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  221)  * 0x00000000 0x0000007F: 0xxxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  222)  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  223)  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  224)  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  225)  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  226)  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  227)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  228)  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  229)  * 17 planes of 65536 values.  This limits the sequences actually seen
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  230)  * even more, to just the following.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  231)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  232)  *          0 -     0x7f: 0                     0x7f
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  233)  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  234)  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  235)  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  236)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  237)  * Even within those ranges not all values are allowed: the surrogates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  238)  * 0xd800 - 0xdfff should never be seen.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  239)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  240)  * Note that the longest sequence seen with valid usage is 4 bytes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  241)  * the same as a single UTF-32 character.  This makes the UTF-8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  242)  * representation of Unicode strictly smaller than UTF-32.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  243)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  244)  * The shortest sequence requirement was introduced by:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  245)  *    Corrigendum #1: UTF-8 Shortest Form
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  246)  * It can be found here:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  247)  *    http://www.unicode.org/versions/corrigendum1.html
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  248)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  249)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  250) 
/* Marker bits of the lead byte of a 2-, 3- and 4-byte sequence. */
#define UTF8_2_BITS     0xC0
#define UTF8_3_BITS     0xE0
#define UTF8_4_BITS     0xF0
/* Marker bits of a continuation byte. */
#define UTF8_N_BITS     0x80
/* Masks selecting the marker bits of each kind of byte. */
#define UTF8_2_MASK     0xE0
#define UTF8_3_MASK     0xF0
#define UTF8_4_MASK     0xF8
#define UTF8_N_MASK     0xC0
/* Payload of a continuation byte: six bits. */
#define UTF8_V_MASK     0x3F
#define UTF8_V_SHIFT    6

/*
 * utf8encode - write the UTF-8 encoding of a code point.
 * @str: destination; up to four bytes are written, no NUL is appended
 * @val: code point to encode
 *
 * Returns the number of bytes written (1 to 4).  For val >= 0x110000
 * a message is printed to stdout, nothing is written and 0 is
 * returned.  Surrogates are not rejected here.
 */
static int utf8encode(char *str, unsigned int val)
{
	int len;

	if (val < 0x80) {
		str[0] = val;
		len = 1;
	} else if (val < 0x800) {
		/* Fill from the last byte back, six payload bits at a time. */
		str[1] = (val & UTF8_V_MASK) | UTF8_N_BITS;
		val >>= UTF8_V_SHIFT;
		str[0] = val | UTF8_2_BITS;
		len = 2;
	} else if (val < 0x10000) {
		str[2] = (val & UTF8_V_MASK) | UTF8_N_BITS;
		val >>= UTF8_V_SHIFT;
		str[1] = (val & UTF8_V_MASK) | UTF8_N_BITS;
		val >>= UTF8_V_SHIFT;
		str[0] = val | UTF8_3_BITS;
		len = 3;
	} else if (val < 0x110000) {
		str[3] = (val & UTF8_V_MASK) | UTF8_N_BITS;
		val >>= UTF8_V_SHIFT;
		str[2] = (val & UTF8_V_MASK) | UTF8_N_BITS;
		val >>= UTF8_V_SHIFT;
		str[1] = (val & UTF8_V_MASK) | UTF8_N_BITS;
		val >>= UTF8_V_SHIFT;
		str[0] = val | UTF8_4_BITS;
		len = 4;
	} else {
		printf("%#x: illegal val\n", val);
		len = 0;
	}
	return len;
}

/*
 * utf8decode - read one code point from a UTF-8 sequence.
 * @str: start of the sequence
 *
 * The sequence length is chosen from the first byte alone and the
 * input is not validated: a first byte in 0x80 - 0xBF is handled like
 * a 2-byte lead, and continuation bytes are masked without checking
 * their marker bits.  Callers must pass well-formed input.
 *
 * Returns the decoded value.
 */
static unsigned int utf8decode(const char *str)
{
	const unsigned char *s = (const unsigned char*)str;
	unsigned int unichar = 0;

	if (*s < 0x80) {
		unichar = *s;
	} else if (*s < UTF8_3_BITS) {
		unichar = *s++ & 0x1F;
		unichar <<= UTF8_V_SHIFT;
		unichar |= *s & 0x3F;
	} else if (*s < UTF8_4_BITS) {
		unichar = *s++ & 0x0F;
		unichar <<= UTF8_V_SHIFT;
		unichar |= *s++ & 0x3F;
		unichar <<= UTF8_V_SHIFT;
		unichar |= *s & 0x3F;
	} else {
		/*
		 * The lead byte is masked with 0x0F rather than 0x07; the
		 * two agree for the lead bytes 0xF0 - 0xF4 that utf8encode()
		 * produces, because bit 3 is clear in all of them.
		 */
		unichar = *s++ & 0x0F;
		unichar <<= UTF8_V_SHIFT;
		unichar |= *s++ & 0x3F;
		unichar <<= UTF8_V_SHIFT;
		unichar |= *s++ & 0x3F;
		unichar <<= UTF8_V_SHIFT;
		unichar |= *s & 0x3F;
	}
	return unichar;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  334) 
/*
 * utf32valid - check that a value lies in the range 0 - 0x10FFFF.
 * @unichar: value to check
 *
 * Only the upper bound is tested, so the surrogates 0xD800 - 0xDFFF
 * are reported as valid too.
 *
 * Returns 1 when unichar < 0x110000, 0 otherwise.
 */
static int utf32valid(unsigned int unichar)
{
	return unichar < 0x110000;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  339) 
/* True when U lies in 0xAC00 - 0xD7A3.  U is evaluated twice. */
#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)

/* Kind of a child pointer: an internal struct node, or a leaf. */
#define NODE 1
#define LEAF 0

/*
 * A tree plus the callbacks used to handle its leaves.
 */
struct tree {
	void *root;		/* root node, or a leaf if childnode is LEAF */
	int childnode;		/* NODE or LEAF: what root points at */
	const char *type;	/* name of the tree, printed when walking it */
	unsigned int maxage;	/* NOTE(review): presumably a packed unicode
				 * version - confirm */
	struct tree *next;	/* NOTE(review): presumably links trees into
				 * a list - confirm */
	/*
	 * Leaf callbacks.  NOTE(review): the descriptions are inferred
	 * from the member names; the callers are mostly outside this part
	 * of the file.
	 */
	int (*leaf_equal)(void *, void *);	/* compare two leaves */
	void (*leaf_print)(void *, int);	/* print a leaf at an indent */
	int (*leaf_mark)(void *);
	int (*leaf_size)(void *);
	int *(*leaf_index)(struct tree *, void *);
	unsigned char *(*leaf_emit)(void *, unsigned char *);
	int leafindex[0x110000];	/* 0x110000 slots; NOTE(review):
					 * presumably one per code point */
	int index;
};

/*
 * An internal node: tests one bit of the key and has up to two
 * children.
 */
struct node {
	/*
	 * NOTE(review): index, offset, mark and size are not used in this
	 * part of the file; presumably bookkeeping for emitting the trie.
	 */
	int index;
	int offset;
	int mark;
	int size;
	struct node *parent;	/* NOTE(review): presumably the parent node */
	void *left;		/* child taken when the tested bit is clear */
	void *right;		/* child taken when the tested bit is set */
	unsigned char bitnum;	/* bit of the current key byte to test */
	unsigned char nextbyte;	/* non-zero: step to the next key byte first */
	unsigned char leftnode;	/* NODE or LEAF: kind of the left child */
	unsigned char rightnode;	/* NODE or LEAF: kind of the right child */
	unsigned int keybits;	/* NOTE(review): presumably the key bits on */
	unsigned int keymask;	/* the path to this node and their mask */
};

/*
 * Example lookup function for a tree.
 *
 * @tree: tree to search; tree->root is used as a struct node without
 *        looking at tree->childnode
 * @key:  bytes to match; one bit is consumed per node visited
 *
 * At each node the key is first advanced by one byte if nextbyte is
 * set, then bit (bitnum & 7) of the current byte selects the right
 * (bit set) or left (bit clear) child.
 *
 * Returns the leaf that was reached, or NULL when the walk hits a
 * missing child or a child of unknown kind.  A NULL pointer stored as
 * a LEAF child also yields NULL instead of being examined again.
 */
static void *lookup(struct tree *tree, const char *key)
{
	struct node *node = tree->root;

	while (node) {
		unsigned char kind;
		void *child;

		if (node->nextbyte)
			key++;
		if (*key & (1 << (node->bitnum & 7))) {
			/* Right leg */
			kind = node->rightnode;
			child = node->right;
		} else {
			/* Left leg */
			kind = node->leftnode;
			child = node->left;
		}

		if (kind == NODE)
			node = child;
		else if (kind == LEAF)
			return child;
		else
			return NULL;
	}

	return NULL;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  412) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  413) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  414)  * A simple non-recursive tree walker: keep track of visits to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  415)  * left and right branches in the leftmask and rightmask.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  416)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  417) static void tree_walk(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  418) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  419) 	struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  420) 	unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  421) 	unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  422) 	unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  423) 	int indent = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  424) 	int nodes, singletons, leaves;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  425) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  426) 	nodes = singletons = leaves = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  427) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  428) 	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  429) 	if (tree->childnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  430) 		assert(tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  431) 		tree->leaf_print(tree->root, indent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  432) 		leaves = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  433) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  434) 		assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  435) 		node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  436) 		leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  437) 		while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  438) 			printf("%*snode @ %p bitnum %d nextbyte %d"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  439) 			       " left %p right %p mask %x bits %x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  440) 				indent, "", node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  441) 				node->bitnum, node->nextbyte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  442) 				node->left, node->right,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  443) 				node->keymask, node->keybits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  444) 			nodes += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  445) 			if (!(node->left && node->right))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  446) 				singletons += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  447) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  448) 			while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  449) 				bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  450) 				if ((leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  451) 					leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  452) 					if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  453) 						assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  454) 						tree->leaf_print(node->left,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  455) 								 indent+1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  456) 						leaves += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  457) 					} else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  458) 						assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  459) 						indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  460) 						node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  461) 						break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  462) 					}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  463) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  464) 				if ((rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  465) 					rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  466) 					if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  467) 						assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  468) 						tree->leaf_print(node->right,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  469) 								 indent+1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  470) 						leaves += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  471) 					} else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  472) 						assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  473) 						indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  474) 						node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  475) 						break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  476) 					}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  477) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  478) 				leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  479) 				rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  480) 				node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  481) 				indent -= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  482) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  483) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  484) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  485) 	printf("nodes %d leaves %d singletons %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  486) 	       nodes, leaves, singletons);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  487) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  488) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  489) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  490)  * Allocate an initialize a new internal node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  491)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  492) static struct node *alloc_node(struct node *parent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  493) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  494) 	struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  495) 	int bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  496) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  497) 	node = malloc(sizeof(*node));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  498) 	node->left = node->right = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  499) 	node->parent = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  500) 	node->leftnode = NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  501) 	node->rightnode = NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  502) 	node->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  503) 	node->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  504) 	node->mark = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  505) 	node->index = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  506) 	node->offset = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  507) 	node->size = 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  508) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  509) 	if (node->parent) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  510) 		bitnum = parent->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  511) 		if ((bitnum & 7) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  512) 			node->bitnum = bitnum + 7 + 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  513) 			node->nextbyte = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  514) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  515) 			node->bitnum = bitnum - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  516) 			node->nextbyte = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  517) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  518) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  519) 		node->bitnum = 7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  520) 		node->nextbyte = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  521) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  522) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  523) 	return node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  524) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  525) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  526) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  527)  * Insert a new leaf into the tree, and collapse any subtrees that are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  528)  * fully populated and end in identical leaves. A nextbyte tagged
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  529)  * internal node will not be removed to preserve the tree's integrity.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  530)  * Note that due to the structure of utf8, no nextbyte tagged node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  531)  * will be a candidate for removal.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  532)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  533) static int insert(struct tree *tree, char *key, int keylen, void *leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  534) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  535) 	struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  536) 	struct node *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  537) 	void **cursor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  538) 	int keybits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  539) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  540) 	assert(keylen >= 1 && keylen <= 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  541) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  542) 	node = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  543) 	cursor = &tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  544) 	keybits = 8 * keylen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  545) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  546) 	/* Insert, creating path along the way. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  547) 	while (keybits) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  548) 		if (!*cursor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  549) 			*cursor = alloc_node(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  550) 		node = *cursor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  551) 		if (node->nextbyte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  552) 			key++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  553) 		if (*key & (1 << (node->bitnum & 7)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  554) 			cursor = &node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  555) 		else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  556) 			cursor = &node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  557) 		keybits--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  558) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  559) 	*cursor = leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  560) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  561) 	/* Merge subtrees if possible. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  562) 	while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  563) 		if (*key & (1 << (node->bitnum & 7)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  564) 			node->rightnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  565) 		else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  566) 			node->leftnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  567) 		if (node->nextbyte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  568) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  569) 		if (node->leftnode == NODE || node->rightnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  570) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  571) 		assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  572) 		assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  573) 		/* Compare */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  574) 		if (! tree->leaf_equal(node->left, node->right))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  575) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  576) 		/* Keep left, drop right leaf. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  577) 		leaf = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  578) 		/* Check in parent */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  579) 		parent = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  580) 		if (!parent) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  581) 			/* root of tree! */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  582) 			tree->root = leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  583) 			tree->childnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  584) 		} else if (parent->left == node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  585) 			parent->left = leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  586) 			parent->leftnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  587) 			if (parent->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  588) 				parent->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  589) 				parent->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  590) 			} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  591) 				parent->keymask |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  592) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  593) 		} else if (parent->right == node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  594) 			parent->right = leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  595) 			parent->rightnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  596) 			if (parent->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  597) 				parent->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  598) 				parent->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  599) 			} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  600) 				parent->keymask |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  601) 				parent->keybits |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  602) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  603) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  604) 			/* internal tree error */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  605) 			assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  606) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  607) 		free(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  608) 		node = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  609) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  610) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  611) 	/* Propagate keymasks up along singleton chains. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  612) 	while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  613) 		parent = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  614) 		if (!parent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  615) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  616) 		/* Nix the mask for parents with two children. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  617) 		if (node->keymask == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  618) 			parent->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  619) 			parent->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  620) 		} else if (parent->left && parent->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  621) 			parent->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  622) 			parent->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  623) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  624) 			assert((parent->keymask & node->keymask) == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  625) 			parent->keymask |= node->keymask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  626) 			parent->keymask |= (1 << parent->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  627) 			parent->keybits |= node->keybits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  628) 			if (parent->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  629) 				parent->keybits |= (1 << parent->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  630) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  631) 		node = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  632) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  633) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  634) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  635) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  636) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  637) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  638)  * Prune internal nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  639)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  640)  * Fully populated subtrees that end at the same leaf have already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  641)  * been collapsed.  There are still internal nodes that have for both
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  642)  * their left and right branches a sequence of singletons that make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  643)  * identical choices and end in identical leaves.  The keymask and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  644)  * keybits collected in the nodes describe the choices made in these
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  645)  * singleton chains.  When they are identical for the left and right
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  646)  * branch of a node, and the two leaves comare identical, the node in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  647)  * question can be removed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  648)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  649)  * Note that nodes with the nextbyte tag set will not be removed by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  650)  * this to ensure tree integrity.  Note as well that the structure of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  651)  * utf8 ensures that these nodes would not have been candidates for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  652)  * removal in any case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  653)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  654) static void prune(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  655) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  656) 	struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  657) 	struct node *left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  658) 	struct node *right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  659) 	struct node *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  660) 	void *leftleaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  661) 	void *rightleaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  662) 	unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  663) 	unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  664) 	unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  665) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  666) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  667) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  668) 		printf("Pruning %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  669) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  670) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  671) 	if (tree->childnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  672) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  673) 	if (!tree->root)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  674) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  675) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  676) 	leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  677) 	node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  678) 	while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  679) 		if (node->nextbyte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  680) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  681) 		if (node->leftnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  682) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  683) 		if (node->rightnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  684) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  685) 		if (!node->left)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  686) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  687) 		if (!node->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  688) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  689) 		left = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  690) 		right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  691) 		if (left->keymask == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  692) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  693) 		if (right->keymask == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  694) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  695) 		if (left->keymask != right->keymask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  696) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  697) 		if (left->keybits != right->keybits)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  698) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  699) 		leftleaf = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  700) 		while (!leftleaf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  701) 			assert(left->left || left->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  702) 			if (left->leftnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  703) 				leftleaf = left->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  704) 			else if (left->rightnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  705) 				leftleaf = left->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  706) 			else if (left->left)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  707) 				left = left->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  708) 			else if (left->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  709) 				left = left->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  710) 			else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  711) 				assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  712) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  713) 		rightleaf = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  714) 		while (!rightleaf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  715) 			assert(right->left || right->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  716) 			if (right->leftnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  717) 				rightleaf = right->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  718) 			else if (right->rightnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  719) 				rightleaf = right->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  720) 			else if (right->left)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  721) 				right = right->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  722) 			else if (right->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  723) 				right = right->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  724) 			else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  725) 				assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  726) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  727) 		if (! tree->leaf_equal(leftleaf, rightleaf))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  728) 			goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  729) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  730) 		 * This node has identical singleton-only subtrees.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  731) 		 * Remove it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  732) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  733) 		parent = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  734) 		left = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  735) 		right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  736) 		if (parent->left == node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  737) 			parent->left = left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  738) 		else if (parent->right == node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  739) 			parent->right = left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  740) 		else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  741) 			assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  742) 		left->parent = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  743) 		left->keymask |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  744) 		node->left = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  745) 		while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  746) 			bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  747) 			leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  748) 			rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  749) 			if (node->leftnode == NODE && node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  750) 				left = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  751) 				free(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  752) 				count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  753) 				node = left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  754) 			} else if (node->rightnode == NODE && node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  755) 				right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  756) 				free(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  757) 				count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  758) 				node = right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  759) 			} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  760) 				node = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  761) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  762) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  763) 		/* Propagate keymasks up along singleton chains. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  764) 		node = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  765) 		/* Force re-check */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  766) 		bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  767) 		leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  768) 		rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  769) 		for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  770) 			if (node->left && node->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  771) 				break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  772) 			if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  773) 				left = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  774) 				node->keymask |= left->keymask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  775) 				node->keybits |= left->keybits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  776) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  777) 			if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  778) 				right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  779) 				node->keymask |= right->keymask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  780) 				node->keybits |= right->keybits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  781) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  782) 			node->keymask |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  783) 			node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  784) 			/* Force re-check */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  785) 			bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  786) 			leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  787) 			rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  788) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  789) 	advance:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  790) 		bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  791) 		if ((leftmask & bitmask) == 0 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  792) 		    node->leftnode == NODE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  793) 		    node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  794) 			leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  795) 			node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  796) 		} else if ((rightmask & bitmask) == 0 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  797) 			   node->rightnode == NODE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  798) 			   node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  799) 			rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  800) 			node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  801) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  802) 			leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  803) 			rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  804) 			node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  805) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  806) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  807) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  808) 		printf("Pruned %d nodes\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  809) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  810) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  811) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  812)  * Mark the nodes in the tree that lead to leaves that must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  813)  * emitted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  814)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  815) static void mark_nodes(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  816) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  817) 	struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  818) 	struct node *n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  819) 	unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  820) 	unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  821) 	unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  822) 	int marked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  823) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  824) 	marked = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  825) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  826) 		printf("Marking %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  827) 	if (tree->childnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  828) 		goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  829) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  830) 	assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  831) 	node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  832) 	leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  833) 	while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  834) 		bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  835) 		if ((leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  836) 			leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  837) 			if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  838) 				assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  839) 				if (tree->leaf_mark(node->left)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  840) 					n = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  841) 					while (n && !n->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  842) 						marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  843) 						n->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  844) 						n = n->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  845) 					}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  846) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  847) 			} else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  848) 				assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  849) 				node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  850) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  851) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  852) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  853) 		if ((rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  854) 			rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  855) 			if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  856) 				assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  857) 				if (tree->leaf_mark(node->right)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  858) 					n = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  859) 					while (n && !n->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  860) 						marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  861) 						n->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  862) 						n = n->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  863) 					}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  864) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  865) 			} else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  866) 				assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  867) 				node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  868) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  869) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  870) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  871) 		leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  872) 		rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  873) 		node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  874) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  875) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  876) 	/* second pass: left siblings and singletons */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  877) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  878) 	assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  879) 	node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  880) 	leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  881) 	while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  882) 		bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  883) 		if ((leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  884) 			leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  885) 			if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  886) 				assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  887) 				if (tree->leaf_mark(node->left)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  888) 					n = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  889) 					while (n && !n->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  890) 						marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  891) 						n->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  892) 						n = n->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  893) 					}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  894) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  895) 			} else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  896) 				assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  897) 				node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  898) 				if (!node->mark && node->parent->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  899) 					marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  900) 					node->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  901) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  902) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  903) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  904) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  905) 		if ((rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  906) 			rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  907) 			if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  908) 				assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  909) 				if (tree->leaf_mark(node->right)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  910) 					n = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  911) 					while (n && !n->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  912) 						marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  913) 						n->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  914) 						n = n->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  915) 					}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  916) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  917) 			} else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  918) 				assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  919) 				node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  920) 				if (!node->mark && node->parent->mark &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  921) 				    !node->parent->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  922) 					marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  923) 					node->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  924) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  925) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  926) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  927) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  928) 		leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  929) 		rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  930) 		node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  931) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  932) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  933) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  934) 		printf("Marked %d nodes\n", marked);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  935) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  936) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  937) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  938)  * Compute the index of each node and leaf, which is the offset in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  939)  * emitted trie.  These values must be pre-computed because relative
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  940)  * offsets between nodes are used to navigate the tree.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  941)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  942) static int index_nodes(struct tree *tree, int index)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  943) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  944) 	struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  945) 	unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  946) 	unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  947) 	unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  948) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  949) 	int indent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  950) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  951) 	/* Align to a cache line (or half a cache line?). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  952) 	while (index % 64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  953) 		index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  954) 	tree->index = index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  955) 	indent = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  956) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  957) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  958) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  959) 		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  960) 	if (tree->childnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  961) 		index += tree->leaf_size(tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  962) 		goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  963) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  964) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  965) 	assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  966) 	node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  967) 	leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  968) 	while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  969) 		if (!node->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  970) 			goto skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  971) 		count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  972) 		if (node->index != index)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  973) 			node->index = index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  974) 		index += node->size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  975) skip:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  976) 		while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  977) 			bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  978) 			if (node->mark && (leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  979) 				leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  980) 				if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  981) 					assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  982) 					*tree->leaf_index(tree, node->left) =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  983) 									index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  984) 					index += tree->leaf_size(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  985) 					count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  986) 				} else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  987) 					assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  988) 					indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  989) 					node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  990) 					break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  991) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  992) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  993) 			if (node->mark && (rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  994) 				rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  995) 				if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  996) 					assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  997) 					*tree->leaf_index(tree, node->right) = index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  998) 					index += tree->leaf_size(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300  999) 					count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) 				} else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) 					assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) 					indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) 					node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) 					break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) 			leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) 			rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) 			node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) 			indent -= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) 	/* Round up to a multiple of 16 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) 	while (index % 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) 		index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) 		printf("Final index %d\n", index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) 	return index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023)  * Mark the nodes in a subtree, helper for size_nodes().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) static int mark_subtree(struct node *node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) 	int changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) 	if (!node || node->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) 	node->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) 	node->index = node->parent->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) 	changed = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) 	if (node->leftnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) 		changed += mark_subtree(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) 	if (node->rightnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) 		changed += mark_subtree(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) 	return changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042)  * Compute the size of nodes and leaves. We start by assuming that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043)  * each node needs to store a three-byte offset. The indexes of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044)  * nodes are calculated based on that, and then this function is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045)  * called to see if the sizes of some nodes can be reduced.  This is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046)  * repeated until no more changes are seen.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) static int size_nodes(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) 	struct tree *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) 	struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) 	struct node *right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) 	struct node *n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) 	unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) 	unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) 	unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) 	unsigned int pathbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) 	unsigned int pathmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) 	unsigned int nbit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) 	int changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) 	int offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) 	int size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) 	int indent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) 	indent = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) 	changed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) 	size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) 		printf("Sizing %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) 	if (tree->childnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) 		goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) 	assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) 	pathbits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) 	pathmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) 	node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) 	leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) 	while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) 		if (!node->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) 			goto skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) 		offset = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) 		if (!node->left || !node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) 			size = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) 			if (node->rightnode == NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) 				/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) 				 * If the right node is not marked,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) 				 * look for a corresponding node in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) 				 * the next tree.  Such a node need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) 				 * not exist.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) 				 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) 				right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) 				next = tree->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) 				while (!right->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) 					assert(next);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) 					n = next->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) 					while (n->bitnum != node->bitnum) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) 						nbit = 1 << n->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) 						if (!(pathmask & nbit))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) 							break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) 						if (pathbits & nbit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) 							if (n->rightnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) 								break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) 							n = n->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) 						} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) 							if (n->leftnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) 								break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) 							n = n->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) 						}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) 					}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) 					if (n->bitnum != node->bitnum)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) 						break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) 					n = n->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) 					right = n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) 					next = next->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) 				/* Make sure the right node is marked. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) 				if (!right->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) 					changed += mark_subtree(right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) 				offset = right->index - node->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) 			} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) 				offset = *tree->leaf_index(tree, node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) 				offset -= node->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) 			assert(offset >= 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) 			assert(offset <= 0xffffff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) 			if (offset <= 0xff) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) 				size = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) 			} else if (offset <= 0xffff) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) 				size = 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) 			} else { /* offset <= 0xffffff */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) 				size = 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) 		if (node->size != size || node->offset != offset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) 			node->size = size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) 			node->offset = offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) 			changed++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) skip:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) 		while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) 			bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) 			pathmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) 			if (node->mark && (leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) 				leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) 				if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) 					assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) 				} else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) 					assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) 					indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) 					node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) 					break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) 			if (node->mark && (rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) 				rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) 				pathbits |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) 				if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) 					assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) 				} else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) 					assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) 					indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) 					node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) 					break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) 			leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) 			rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) 			pathmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) 			pathbits &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) 			node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) 			indent -= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) 		printf("Found %d changes\n", changed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) 	return changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183)  * Emit a trie for the given tree into the data array.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) static void emit(struct tree *tree, unsigned char *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) 	struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) 	unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) 	unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) 	unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) 	int offlen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) 	int offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) 	int index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) 	int indent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) 	int size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) 	int bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) 	int leaves;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) 	int nodes[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) 	unsigned char byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) 	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) 	leaves = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) 	bytes = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) 	index = tree->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) 	data += index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) 	indent = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) 		printf("Emitting %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) 	if (tree->childnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) 		assert(tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) 		tree->leaf_emit(tree->root, data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) 		size = tree->leaf_size(tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) 		index += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) 		leaves++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) 		goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) 	assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) 	node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) 	leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) 	while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) 		if (!node->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) 			goto skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) 		assert(node->offset != -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) 		assert(node->index == index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) 		byte = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) 		if (node->nextbyte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) 			byte |= NEXTBYTE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) 		byte |= (node->bitnum & BITNUM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) 		if (node->left && node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) 			if (node->leftnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) 				byte |= LEFTNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) 			if (node->rightnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) 				byte |= RIGHTNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) 			if (node->offset <= 0xff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) 				offlen = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) 			else if (node->offset <= 0xffff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) 				offlen = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) 			else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) 				offlen = 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) 			nodes[offlen]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) 			offset = node->offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) 			byte |= offlen << OFFLEN_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) 			*data++ = byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) 			index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) 			while (offlen--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) 				*data++ = offset & 0xff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) 				index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) 				offset >>= 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) 		} else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) 			if (node->leftnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) 				byte |= TRIENODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) 			nodes[0]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) 			*data++ = byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) 			index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) 		} else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) 			byte |= RIGHTNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) 			if (node->rightnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) 				byte |= TRIENODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) 			nodes[0]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) 			*data++ = byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) 			index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) 			assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) skip:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) 		while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) 			bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) 			if (node->mark && (leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) 				leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) 				if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) 					assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) 					data = tree->leaf_emit(node->left,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) 							       data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) 					size = tree->leaf_size(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) 					index += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) 					bytes += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) 					leaves++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) 				} else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) 					assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) 					indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) 					node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) 					break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) 			if (node->mark && (rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) 				rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) 				if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) 					assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) 					data = tree->leaf_emit(node->right,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) 							       data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) 					size = tree->leaf_size(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) 					index += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) 					bytes += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) 					leaves++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) 				} else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) 					assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) 					indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) 					node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) 					break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) 			leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) 			rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) 			node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) 			indent -= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) 	if (verbose > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) 		printf("Emitted %d (%d) leaves",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) 			leaves, bytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) 		printf(" %d (%d+%d+%d+%d) nodes",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) 			nodes[0] + nodes[1] + nodes[2] + nodes[3],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) 			nodes[0], nodes[1], nodes[2], nodes[3]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) 		printf(" %d total\n", index - tree->index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) 
/*
 * Unicode data.
 *
 * One record per code point, holding what the generator needs: the
 * Canonical Combining Class, the Age, and the decompositions.
 *
 * The Age is not stored as a version number but as an index into the
 * ages table, i.e. a generation number that the table maps back to a
 * unicode version.
 *
 * A record that lives in the corrections array (decompositions that
 * were corrected in later revisions) has its correction field set to
 * the Unicode version in which the mapping was corrected.
 */
struct unicode_data {
	/* The code point this record describes. */
	unsigned int code;
	/* Canonical Combining Class. */
	int ccc;
	/* Generation: index into the ages table; negative means skipped. */
	int gen;
	/* Unicode version in which the mapping was corrected. */
	int correction;
	/* Zero-terminated code point sequences for the decompositions. */
	unsigned int *utf32nfdi;
	unsigned int *utf32nfdicf;
	/*
	 * NUL-terminated UTF-8 forms of the sequences above.  A first
	 * byte of HANGUL in utf8nfdi marks a Hangul syllable.
	 */
	char *utf8nfdi;
	char *utf8nfdicf;
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) struct unicode_data unicode_data[0x110000];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) struct unicode_data *corrections;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) int    corrections_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) struct tree *nfdi_tree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) struct tree *nfdicf_tree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) struct tree *trees;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) int          trees_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361)  * Check the corrections array to see if this entry was corrected at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362)  * some point.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) static struct unicode_data *corrections_lookup(struct unicode_data *u)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) 	for (i = 0; i != corrections_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) 		if (u->code == corrections[i].code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) 			return &corrections[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) 	return u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) static int nfdi_equal(void *l, void *r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) 	struct unicode_data *left = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) 	struct unicode_data *right = r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) 	if (left->gen != right->gen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) 	if (left->ccc != right->ccc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) 	if (left->utf8nfdi && right->utf8nfdi &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) 		return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) 	if (left->utf8nfdi || right->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) 	return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) static int nfdicf_equal(void *l, void *r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) 	struct unicode_data *left = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) 	struct unicode_data *right = r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) 	if (left->gen != right->gen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) 	if (left->ccc != right->ccc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) 	if (left->utf8nfdicf && right->utf8nfdicf &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) 	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) 		return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) 	if (left->utf8nfdicf && right->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) 	if (left->utf8nfdicf || right->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) 	if (left->utf8nfdi && right->utf8nfdi &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) 		return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) 	if (left->utf8nfdi || right->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) 		return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) 	return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) static void nfdi_print(void *l, int indent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) 		leaf->code, leaf->ccc, leaf->gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) 	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) 	else if (leaf->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) static void nfdicf_print(void *l, int indent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) 		leaf->code, leaf->ccc, leaf->gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) 	if (leaf->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) 		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) 	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) 	else if (leaf->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) 
/* leaf_mark callout for the nfdi tree: every leaf is marked. */
static int nfdi_mark(void *l)
{
	(void)l;	/* the leaf contents do not matter here */
	return 1;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) static int nfdicf_mark(void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) 	if (leaf->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) 		return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) static int correction_mark(void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) 	return leaf->correction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) static int nfdi_size(void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) 	int size = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) 	if (HANGUL_SYLLABLE(leaf->code))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) 		size += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) 	else if (leaf->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) 		size += strlen(leaf->utf8nfdi) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) 	return size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) static int nfdicf_size(void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) 	int size = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) 	if (HANGUL_SYLLABLE(leaf->code))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) 		size += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) 	else if (leaf->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) 		size += strlen(leaf->utf8nfdicf) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) 	else if (leaf->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) 		size += strlen(leaf->utf8nfdi) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) 	return size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) static int *nfdi_index(struct tree *tree, void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) 	return &tree->leafindex[leaf->code];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) static int *nfdicf_index(struct tree *tree, void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) 	return &tree->leafindex[leaf->code];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) static unsigned char *nfdi_emit(void *l, unsigned char *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) 	unsigned char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) 	*data++ = leaf->gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) 	if (HANGUL_SYLLABLE(leaf->code)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) 		*data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) 		*data++ = HANGUL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) 	} else if (leaf->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) 		*data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) 		s = (unsigned char*)leaf->utf8nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) 		while ((*data++ = *s++) != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) 			;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) 		*data++ = leaf->ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) 	return data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) static unsigned char *nfdicf_emit(void *l, unsigned char *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) 	struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) 	unsigned char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) 	*data++ = leaf->gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) 	if (HANGUL_SYLLABLE(leaf->code)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) 		*data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) 		*data++ = HANGUL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) 	} else if (leaf->utf8nfdicf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) 		*data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) 		s = (unsigned char*)leaf->utf8nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) 		while ((*data++ = *s++) != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) 			;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) 	} else if (leaf->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) 		*data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) 		s = (unsigned char*)leaf->utf8nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) 		while ((*data++ = *s++) != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) 			;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) 	} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) 		*data++ = leaf->ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) 	return data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) static void utf8_create(struct unicode_data *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) 	char utf[18*4+1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) 	char *u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) 	unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) 	if (data->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) 		assert(data->utf8nfdi[0] == HANGUL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) 		return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) 	u = utf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) 	um = data->utf32nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) 	if (um) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) 		for (i = 0; um[i]; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) 			u += utf8encode(u, um[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) 		*u = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) 		data->utf8nfdi = strdup(utf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) 	u = utf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) 	um = data->utf32nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) 	if (um) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) 		for (i = 0; um[i]; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) 			u += utf8encode(u, um[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) 		*u = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) 		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) 			data->utf8nfdicf = strdup(utf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) static void utf8_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) 	for (unichar = 0; unichar != 0x110000; unichar++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) 		utf8_create(&unicode_data[unichar]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) 	for (i = 0; i != corrections_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) 		utf8_create(&corrections[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) static void trees_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) 	struct unicode_data *data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) 	unsigned int maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) 	unsigned int nextage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) 	int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) 	/* Count the number of different ages. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) 	nextage = (unsigned int)-1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) 	do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) 		maxage = nextage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) 		nextage = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) 		for (i = 0; i <= corrections_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) 			data = &corrections[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) 			if (nextage < data->correction &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) 			    data->correction < maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) 				nextage = data->correction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) 		count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) 	} while (nextage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) 	/* Two trees per age: nfdi and nfdicf */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) 	trees_count = count * 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) 	trees = calloc(trees_count, sizeof(struct tree));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) 	/* Assign ages to the trees. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) 	count = trees_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) 	nextage = (unsigned int)-1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) 	do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) 		maxage = nextage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) 		trees[--count].maxage = maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) 		trees[--count].maxage = maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) 		nextage = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) 		for (i = 0; i <= corrections_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) 			data = &corrections[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) 			if (nextage < data->correction &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) 			    data->correction < maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) 				nextage = data->correction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) 	} while (nextage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) 	/* The ages assigned above are off by one. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) 	for (i = 0; i != trees_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) 		j = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) 		while (ages[j] < trees[i].maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) 			j++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) 		trees[i].maxage = ages[j-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) 	/* Set up the forwarding between trees. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) 	trees[trees_count-2].next = &trees[trees_count-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) 	trees[trees_count-1].leaf_mark = nfdi_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) 	trees[trees_count-2].leaf_mark = nfdicf_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) 	for (i = 0; i != trees_count-2; i += 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) 		trees[i].next = &trees[trees_count-2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) 		trees[i].leaf_mark = correction_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) 		trees[i+1].next = &trees[trees_count-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) 		trees[i+1].leaf_mark = correction_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) 	/* Assign the callouts. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) 	for (i = 0; i != trees_count; i += 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) 		trees[i].type = "nfdicf";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) 		trees[i].leaf_equal = nfdicf_equal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) 		trees[i].leaf_print = nfdicf_print;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) 		trees[i].leaf_size = nfdicf_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) 		trees[i].leaf_index = nfdicf_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) 		trees[i].leaf_emit = nfdicf_emit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) 		trees[i+1].type = "nfdi";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) 		trees[i+1].leaf_equal = nfdi_equal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) 		trees[i+1].leaf_print = nfdi_print;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) 		trees[i+1].leaf_size = nfdi_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) 		trees[i+1].leaf_index = nfdi_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) 		trees[i+1].leaf_emit = nfdi_emit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) 	/* Finish init. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) 	for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) 		trees[i].childnode = NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) static void trees_populate(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) 	struct unicode_data *data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) 	char keyval[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) 	int keylen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) 	for (i = 0; i != trees_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) 		if (verbose > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) 			printf("Populating %s_%x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) 				trees[i].type, trees[i].maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) 		for (unichar = 0; unichar != 0x110000; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) 			if (unicode_data[unichar].gen < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) 			keylen = utf8encode(keyval, unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) 			data = corrections_lookup(&unicode_data[unichar]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) 			if (data->correction <= trees[i].maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) 				data = &unicode_data[unichar];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) 			insert(&trees[i], keyval, keylen, data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) static void trees_reduce(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) 	int size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) 	int changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) 	for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) 		prune(&trees[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) 	for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) 		mark_nodes(&trees[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) 	do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) 		size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) 		for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) 			size = index_nodes(&trees[i], size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) 		changed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) 		for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) 			changed += size_nodes(&trees[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) 	} while (changed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) 	utf8data = calloc(size, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) 	utf8data_size = size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) 	for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) 		emit(&trees[i], utf8data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) 	if (verbose > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) 		for (i = 0; i != trees_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) 			printf("%s_%x idx %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) 				trees[i].type, trees[i].maxage, trees[i].index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) 	nfdi = utf8data + trees[trees_count-1].index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) 	nfdicf = utf8data + trees[trees_count-2].index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) 	nfdi_tree = &trees[trees_count-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) 	nfdicf_tree = &trees[trees_count-2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) static void verify(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) 	struct unicode_data *data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) 	utf8leaf_t	*leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) 	unsigned int	unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) 	char		key[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) 	unsigned char	hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) 	int		report;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) 	int		nocf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) 		printf("Verifying %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) 	nocf = strcmp(tree->type, "nfdicf");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) 	for (unichar = 0; unichar != 0x110000; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) 		report = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) 		data = corrections_lookup(&unicode_data[unichar]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) 		if (data->correction <= tree->maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) 			data = &unicode_data[unichar];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) 		utf8encode(key,unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) 		leaf = utf8lookup(tree, hangul, key);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) 		if (!leaf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) 			if (data->gen != -1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) 				report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) 			if (unichar < 0xd800 || unichar > 0xdfff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) 				report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) 			if (unichar >= 0xd800 && unichar <= 0xdfff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) 				report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) 			if (data->gen == -1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) 				report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) 			if (data->gen != LEAF_GEN(leaf))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) 				report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) 			if (LEAF_CCC(leaf) == DECOMPOSE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) 				if (HANGUL_SYLLABLE(data->code)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) 					if (data->utf8nfdi[0] != HANGUL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) 						report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) 				} else if (nocf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) 					if (!data->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) 						report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) 					} else if (strcmp(data->utf8nfdi,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) 							  LEAF_STR(leaf))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) 						report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) 					}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) 				} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) 					if (!data->utf8nfdicf &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) 					    !data->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) 						report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) 					} else if (data->utf8nfdicf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) 						if (strcmp(data->utf8nfdicf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) 							   LEAF_STR(leaf)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) 							report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) 					} else if (strcmp(data->utf8nfdi,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) 							  LEAF_STR(leaf))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) 						report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) 					}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) 			} else if (data->ccc != LEAF_CCC(leaf)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) 				report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) 		if (report) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) 			printf("%X code %X gen %d ccc %d"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) 				" nfdi -> \"%s\"",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) 				unichar, data->code, data->gen,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) 				data->ccc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) 				data->utf8nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) 			if (leaf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) 				printf(" gen %d ccc %d"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) 					" nfdi -> \"%s\"",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) 					LEAF_GEN(leaf),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) 					LEAF_CCC(leaf),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) 					LEAF_CCC(leaf) == DECOMPOSE ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) 						LEAF_STR(leaf) : "");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) 			printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) static void trees_verify(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) 	for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) 		verify(&trees[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) static void help(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) 	printf("Usage: %s [options]\n", argv0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) 	printf("This program creates an a data trie used for parsing and\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) 	printf("normalization of UTF-8 strings. The trie is derived from\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) 	printf("a set of input files from the Unicode character database\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) 	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) 	printf("The generated tree supports two normalization forms:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) 	printf("\tnfdi:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) 	printf("\t- Apply unicode normalization form NFD.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) 	printf("\tnfdicf:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) 	printf("\t- Apply unicode normalization form NFD.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) 	printf("\t- Apply a full casefold (C + F).\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) 	printf("These forms were chosen as being most useful when dealing\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) 	printf("with file names: NFD catches most cases where characters\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) 	printf("should be considered equivalent. The ignorables are mostly\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) 	printf("invisible, making names hard to type.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) 	printf("The options to specify the files to be used are listed\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) 	printf("below with their default values, which are the names used\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) 	printf("by version 11.0.0 of the Unicode Character Database.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) 	printf("The input files:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) 	printf("\t-a %s\n", AGE_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) 	printf("\t-c %s\n", CCC_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) 	printf("\t-p %s\n", PROP_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) 	printf("\t-d %s\n", DATA_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) 	printf("\t-f %s\n", FOLD_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) 	printf("\t-n %s\n", NORM_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) 	printf("Additionally, the generated tables are tested using:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) 	printf("\t-t %s\n", TEST_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) 	printf("Finally, the output file:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) 	printf("\t-o %s\n", UTF8_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880) 
/*
 * Print the help text and terminate the program with exit status 1.
 * Does not return.
 */
static void usage(void)
{
	help();
	exit(1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886) 
/*
 * Report that the file @name could not be opened and exit with
 * status 1.  @error is the errno value of the failed open; both the
 * number and its strerror() text are printed.
 */
static void open_fail(const char *name, int error)
{
	const char *reason = strerror(error);

	printf("Error %d opening %s: %s\n", error, name, reason);
	exit(1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892) 
/*
 * Report a parse failure that concerns @filename as a whole (no
 * specific line) and exit with status 1.
 */
static void file_fail(const char *filename)
{
	fprintf(stdout, "Error parsing %s\n", filename);
	exit(1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898) 
/*
 * Report a parse failure on a specific @line of @filename and exit
 * with status 1.  The offending line text is echoed after the name.
 */
static void line_fail(const char *filename, const char *line)
{
	fprintf(stdout, "Error parsing %s:%s\n", filename, line);
	exit(1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906) 
/*
 * Print every element of the 0-terminated array @utf32str to stdout
 * as " %X" (a space followed by upper-case hex).  The terminating 0
 * is not printed, and no newline is added.
 */
static void print_utf32(unsigned int *utf32str)
{
	unsigned int *cursor = utf32str;

	while (*cursor) {
		printf(" %X", *cursor);
		cursor++;
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) static void print_utf32nfdi(unsigned int unichar)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) 	printf(" %X ->", unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) 	print_utf32(unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) static void print_utf32nfdicf(unsigned int unichar)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) 	printf(" %X ->", unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) 	print_utf32(unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) 	printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) static void age_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) 	FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) 	unsigned int first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) 	unsigned int last;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) 	unsigned int major;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) 	unsigned int minor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) 	unsigned int revision;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) 	int gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) 		printf("Parsing %s\n", age_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) 	file = fopen(age_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) 	if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) 		open_fail(age_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) 	gen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) 	while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) 		ret = sscanf(line, "# Age=V%d_%d_%d",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) 				&major, &minor, &revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) 		if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) 			ages_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) 				printf(" Age V%d_%d_%d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) 					major, minor, revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) 			if (!age_valid(major, minor, revision))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) 				line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) 		if (ret == 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) 			ages_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) 				printf(" Age V%d_%d\n", major, minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) 			if (!age_valid(major, minor, 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) 				line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) 	/* We must have found something above. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) 	if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) 		printf("%d age entries\n", ages_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) 	if (ages_count == 0 || ages_count > MAXGEN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) 		file_fail(age_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) 	/* There is a 0 entry. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) 	ages_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) 	ages = calloc(ages_count + 1, sizeof(*ages));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) 	/* And a guard entry. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) 	ages[ages_count] = (unsigned int)-1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) 	rewind(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) 	gen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) 	while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) 		ret = sscanf(line, "# Age=V%d_%d_%d",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) 				&major, &minor, &revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) 		if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) 			ages[++gen] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) 				UNICODE_AGE(major, minor, revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) 				printf(" Age V%d_%d_%d = gen %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) 					major, minor, revision, gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) 			if (!age_valid(major, minor, revision))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) 				line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) 		if (ret == 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) 			ages[++gen] = UNICODE_AGE(major, minor, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) 				printf(" Age V%d_%d = %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) 					major, minor, gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) 			if (!age_valid(major, minor, 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) 				line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) 		ret = sscanf(line, "%X..%X ; %d.%d #",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) 			     &first, &last, &major, &minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) 		if (ret == 4) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) 			for (unichar = first; unichar <= last; unichar++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) 				unicode_data[unichar].gen = gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) 			count += 1 + last - first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) 				printf("  %X..%X gen %d\n", first, last, gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) 			if (!utf32valid(first) || !utf32valid(last))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) 				line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) 		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) 		if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) 			unicode_data[unichar].gen = gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) 			count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) 				printf("  %X gen %d\n", unichar, gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) 			if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) 				line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) 	unicode_maxage = ages[gen];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) 	fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) 	/* Nix surrogate block */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) 	if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) 		printf(" Removing surrogate block D800..DFFF\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) 	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) 		unicode_data[unichar].gen = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) 	        printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) 	if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) 		file_fail(age_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) static void ccc_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) 	FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) 	unsigned int first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) 	unsigned int last;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) 	unsigned int value;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) 		printf("Parsing %s\n", ccc_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) 	file = fopen(ccc_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) 	if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) 		open_fail(ccc_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) 	while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) 		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) 		if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) 			for (unichar = first; unichar <= last; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) 				unicode_data[unichar].ccc = value;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075)                                 count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) 				printf(" %X..%X ccc %d\n", first, last, value);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) 			if (!utf32valid(first) || !utf32valid(last))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) 				line_fail(ccc_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) 		ret = sscanf(line, "%X ; %d #", &unichar, &value);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) 		if (ret == 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) 			unicode_data[unichar].ccc = value;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086)                         count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) 				printf(" %X ccc %d\n", unichar, value);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) 			if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) 				line_fail(ccc_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) 	fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) 		printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) 	if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) 		file_fail(ccc_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101) 
/*
 * Tell whether @type is one of the decomposition tag names listed in
 * the table below.
 *
 * @type: NUL-terminated tag name without the surrounding '<' and '>'.
 *
 * Returns 1 on an exact (case-sensitive) match, 0 otherwise.
 */
static int ignore_compatibility_form(char *type)
{
	/* Read-only table, built once; NULL marks the end. */
	static const char *const ignored_types[] = {
		"font", "noBreak", "initial", "medial",
		"final", "isolated", "circle", "super",
		"sub", "vertical", "wide", "narrow",
		"small", "square", "fraction", "compat",
		NULL
	};
	const char *const *entry;

	for (entry = ignored_types; *entry; entry++)
		if (strcmp(type, *entry) == 0)
			return 1;
	return 0;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) static void nfdi_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) 	FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) 	char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) 	char *type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) 	unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) 		printf("Parsing %s\n", data_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) 	file = fopen(data_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) 	if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) 		open_fail(data_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) 	while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) 		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) 			     &unichar, buf0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) 		if (ret != 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) 		if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) 			line_fail(data_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) 		s = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) 		/* skip over <tag> */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) 		if (*s == '<') {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) 			type = ++s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) 			while (*++s != '>');
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) 			*s++ = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) 			if(ignore_compatibility_form(type))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) 		/* decode the decomposition into UTF-32 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) 		i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) 		while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) 			mapping[i] = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) 			if (!utf32valid(mapping[i]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) 				line_fail(data_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) 			i++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) 		mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) 		um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) 		memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) 		unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) 		if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) 			print_utf32nfdi(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) 		count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) 	fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) 		printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) 	if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) 		file_fail(data_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) static void nfdicf_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) 	FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) 	char status;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) 	char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) 	unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) 		printf("Parsing %s\n", fold_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) 	file = fopen(fold_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) 	if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) 		open_fail(fold_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) 	while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) 		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) 		if (ret != 3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) 		if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) 			line_fail(fold_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) 		/* Use the C+F casefold. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) 		if (status != 'C' && status != 'F')
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) 		s = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) 		if (*s == '<')
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) 			while (*s++ != ' ')
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) 				;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) 		i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) 		while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) 			mapping[i] = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) 			if (!utf32valid(mapping[i]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) 				line_fail(fold_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) 			i++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) 		mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) 		um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) 		memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) 		unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) 		if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) 			print_utf32nfdicf(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) 		count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) 	fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) 		printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) 	if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) 		file_fail(fold_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) static void ignore_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) 	FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) 	unsigned int first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) 	unsigned int last;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) 	unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) 		printf("Parsing %s\n", prop_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) 	file = fopen(prop_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) 	if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) 		open_fail(prop_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) 	assert(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) 	while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) 		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) 		if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) 			if (!utf32valid(first) || !utf32valid(last))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) 				line_fail(prop_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) 			for (unichar = first; unichar <= last; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) 				free(unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) 				um = malloc(sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) 				*um = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) 				unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) 				free(unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) 				um = malloc(sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) 				*um = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) 				unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) 				count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) 				printf(" %X..%X Default_Ignorable_Code_Point\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) 					first, last);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) 		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) 		if (ret == 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) 				continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) 			if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) 				line_fail(prop_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) 			free(unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) 			um = malloc(sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) 			*um = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) 			unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) 			free(unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) 			um = malloc(sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) 			*um = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) 			unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) 			if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) 				printf(" %X Default_Ignorable_Code_Point\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) 					unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) 			count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) 	fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) 		printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) 	if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) 		file_fail(prop_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) static void corrections_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) 	FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) 	unsigned int major;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) 	unsigned int minor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) 	unsigned int revision;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) 	unsigned int age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) 	unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) 	char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) 		printf("Parsing %s\n", norm_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) 	file = fopen(norm_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) 	if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) 		open_fail(norm_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) 	while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) 				&unichar, buf0, buf1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) 				&major, &minor, &revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) 		if (ret != 6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) 			line_fail(norm_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) 		count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) 	corrections = calloc(count, sizeof(struct unicode_data));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) 	corrections_count = count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) 	rewind(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) 	while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) 				&unichar, buf0, buf1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) 				&major, &minor, &revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) 		if (ret != 6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) 			line_fail(norm_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) 		corrections[count] = unicode_data[unichar];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) 		assert(corrections[count].code == unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) 		age = UNICODE_AGE(major, minor, revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) 		corrections[count].correction = age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) 		i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) 		s = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) 		while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) 			mapping[i] = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) 			if (!utf32valid(mapping[i]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) 				line_fail(norm_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) 			i++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) 		mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) 		um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) 		memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) 		corrections[count].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) 		if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) 			printf(" %X -> %s -> %s V%d_%d_%d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) 				unichar, buf0, buf1, major, minor, revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) 		count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) 	fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) 	        printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) 	if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) 		file_fail(norm_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382)  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384)  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385)  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387)  * SBase = 0xAC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388)  * LBase = 0x1100
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389)  * VBase = 0x1161
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390)  * TBase = 0x11A7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391)  * LCount = 19
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392)  * VCount = 21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393)  * TCount = 28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394)  * NCount = 588 (VCount * TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395)  * SCount = 11172 (LCount * NCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397)  * Decomposition:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398)  *   SIndex = s - SBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400)  * LV (Canonical/Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401)  *   LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402)  *   VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403)  *   LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404)  *   VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406)  * LVT (Canonical)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407)  *   LVIndex = (SIndex / TCount) * TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408)  *   TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409)  *   LVPart = SBase + LVIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410)  *   TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412)  * LVT (Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413)  *   LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414)  *   VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415)  *   TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416)  *   LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417)  *   VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418)  *   if (TIndex == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419)  *          d = <LPart, VPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420)  *   } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421)  *          TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422)  *          d = <LPart, VPart, TPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423)  *   }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) static void hangul_decompose(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) 	unsigned int sb = 0xAC00;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) 	unsigned int lb = 0x1100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) 	unsigned int vb = 0x1161;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) 	unsigned int tb = 0x11a7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) 	/* unsigned int lc = 19; */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) 	unsigned int vc = 21;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) 	unsigned int tc = 28;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) 	unsigned int nc = (vc * tc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) 	/* unsigned int sc = (lc * nc); */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) 	unsigned int mapping[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) 	unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441)         int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) 		printf("Decomposing hangul\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) 	/* Hangul */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) 	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) 		unsigned int si = unichar - sb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) 		unsigned int li = si / nc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) 		unsigned int vi = (si % nc) / tc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) 		unsigned int ti = si % tc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) 		i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) 		mapping[i++] = lb + li;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) 		mapping[i++] = vb + vi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) 		if (ti)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) 			mapping[i++] = tb + ti;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) 		mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) 		assert(!unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) 		um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) 		memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) 		unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) 		assert(!unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) 		um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) 		memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) 		unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) 		 * Add a cookie as a reminder that the hangul syllable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) 		 * decompositions must not be stored in the generated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) 		 * trie.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) 		unicode_data[unichar].utf8nfdi = malloc(2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) 		unicode_data[unichar].utf8nfdi[0] = HANGUL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) 		unicode_data[unichar].utf8nfdi[1] = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) 		if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) 			print_utf32nfdi(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) 		count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) 		printf("Created %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) static void nfdi_decompose(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) 	unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) 	unsigned int *dc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) 	int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) 		printf("Decomposing nfdi\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) 	for (unichar = 0; unichar != 0x110000; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) 		if (!unicode_data[unichar].utf32nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) 		for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) 			ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) 			i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) 			um = unicode_data[unichar].utf32nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) 			while (*um) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) 				dc = unicode_data[*um].utf32nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) 				if (dc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) 					for (j = 0; dc[j]; j++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) 						mapping[i++] = dc[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) 					ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) 				} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) 					mapping[i++] = *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) 				um++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) 			mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) 			if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) 				break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) 			free(unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) 			um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) 			memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) 			unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) 		/* Add this decomposition to nfdicf if there is no entry. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) 		if (!unicode_data[unichar].utf32nfdicf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) 			um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) 			memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) 			unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) 		if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) 			print_utf32nfdi(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) 		count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) 		printf("Processed %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) static void nfdicf_decompose(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) 	unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) 	unsigned int *dc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) 	int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) 	int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) 		printf("Decomposing nfdicf\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) 	count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) 	for (unichar = 0; unichar != 0x110000; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) 		if (!unicode_data[unichar].utf32nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) 		for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) 			ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) 			i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) 			um = unicode_data[unichar].utf32nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) 			while (*um) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) 				dc = unicode_data[*um].utf32nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) 				if (dc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) 					for (j = 0; dc[j]; j++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) 						mapping[i++] = dc[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) 					ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) 				} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) 					mapping[i++] = *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) 				um++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) 			mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) 			if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) 				break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) 			free(unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) 			um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) 			memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) 			unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) 		if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) 			print_utf32nfdicf(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) 		count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) 		printf("Processed %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593) 
/* Prototypes for the utf8 age, length and cursor routines. */
int utf8agemax(struct tree *tree, const char *s);
int utf8nagemax(struct tree *tree, const char *s, size_t len);
int utf8agemin(struct tree *tree, const char *s);
int utf8nagemin(struct tree *tree, const char *s, size_t len);
ssize_t utf8len(struct tree *tree, const char *s);
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len);

struct utf8cursor;
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s);
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
		size_t len);
int utf8byte(struct utf8cursor *u8c);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606)  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608)  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609)  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611)  * SBase = 0xAC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612)  * LBase = 0x1100
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613)  * VBase = 0x1161
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614)  * TBase = 0x11A7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615)  * LCount = 19
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616)  * VCount = 21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617)  * TCount = 28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618)  * NCount = 588 (VCount * TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619)  * SCount = 11172 (LCount * NCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621)  * Decomposition:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622)  *   SIndex = s - SBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624)  * LV (Canonical/Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625)  *   LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626)  *   VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627)  *   LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628)  *   VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630)  * LVT (Canonical)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631)  *   LVIndex = (SIndex / TCount) * TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632)  *   TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633)  *   LVPart = SBase + LVIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634)  *   TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636)  * LVT (Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637)  *   LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638)  *   VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639)  *   TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640)  *   LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641)  *   VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642)  *   if (TIndex == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643)  *          d = <LPart, VPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644)  *   } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645)  *          TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646)  *          d = <LPart, VPart, TPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647)  *   }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649) 
/* Hangul syllable decomposition constants. */
#define SB	0xAC00		/* SBase */
#define LB	0x1100		/* LBase */
#define VB	0x1161		/* VBase */
#define TB	0x11A7		/* TBase */
#define LC	19		/* LCount */
#define VC	21		/* VCount */
#define TC	28		/* TCount */
#define NC	(VC * TC)	/* NCount = 588 */
#define SC	(LC * NC)	/* SCount = 11172 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) /* Algorithmic decomposition of hangul syllable. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) 	unsigned int	si;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) 	unsigned int	li;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) 	unsigned int	vi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) 	unsigned int	ti;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) 	unsigned char	*h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) 	/* Calculate the SI, LI, VI, and TI values. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) 	si = utf8decode(str) - SB;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) 	li = si / NC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) 	vi = (si % NC) / TC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) 	ti = si % TC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) 	/* Fill in base of leaf. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) 	h = hangul;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) 	LEAF_GEN(h) = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) 	LEAF_CCC(h) = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) 	h += 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) 	/* Add LPart, a 3-byte UTF-8 sequence. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) 	h += utf8encode((char *)h, li + LB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) 	/* Add VPart, a 3-byte UTF-8 sequence. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) 	h += utf8encode((char *)h, vi + VB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) 	if (ti)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) 		h += utf8encode((char *)h, ti + TB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) 	/* Terminate string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) 	h[0] = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) 	return hangul;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699)  * Use trie to scan s, touching at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700)  * Returns the leaf if one exists, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702)  * A non-NULL return guarantees that the UTF-8 sequence starting at s
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703)  * is well-formed and corresponds to a known unicode code point.  The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704)  * shorthand for this will be "is valid UTF-8 unicode".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) 			       const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) 	utf8trie_t	*trie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) 	int		offlen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) 	int		offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) 	int		mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) 	int		node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) 	if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) 		return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) 	if (len == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) 		return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) 	node = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) 	trie = utf8data + tree->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) 	while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) 		if (*trie & NEXTBYTE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) 			if (--len == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) 				return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) 			s++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) 		mask = 1 << (*trie & BITNUM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) 		if (*s & mask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) 			/* Right leg */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) 			if (offlen) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) 				/* Right node at offset of trie */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) 				node = (*trie & RIGHTNODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) 				offset = trie[offlen];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) 				while (--offlen) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) 					offset <<= 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) 					offset |= trie[offlen];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) 				}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) 				trie += offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) 			} else if (*trie & RIGHTPATH) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) 				/* Right node after this node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) 				node = (*trie & TRIENODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) 				trie++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) 			} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) 				/* No right node. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) 				return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) 			/* Left leg */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) 			if (offlen) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) 				/* Left node after this node. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) 				node = (*trie & LEFTNODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) 				trie += offlen + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) 			} else if (*trie & RIGHTPATH) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) 				/* No left node. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) 				return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) 			} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) 				/* Left node after this node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) 				node = (*trie & TRIENODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) 				trie++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) 	/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) 	 * Hangul decomposition is done algorithmically. These are the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) 	 * always 3 bytes long, so s has been advanced twice, and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) 	 * start of the sequence is at s-2.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) 	 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) 		trie = utf8hangul(s - 2, hangul);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) 	return trie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776)  * Use trie to scan s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777)  * Returns the leaf if one exists, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779)  * Forwards to trie_nlookup().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) 			      const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) 	return utf8nlookup(tree, hangul, s, (size_t)-1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786) 
/*
 * Return the number of bytes used by the current UTF-8 sequence.
 * Assumes the input points to the first byte of a valid UTF-8
 * sequence; only that first byte is inspected.
 */
static inline int utf8clen(const char *s)
{
	unsigned char lead = *s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799)  * Maximum age of any character in s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800)  * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801)  * Return 0 if only non-assigned code points are used.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) int utf8agemax(struct tree *tree, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) 	utf8leaf_t	*leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) 	int		age = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) 	int		leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) 	unsigned char	hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) 	if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) 	while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) 		leaf = utf8lookup(tree, hangul, s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) 		if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) 			return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) 		leaf_age = ages[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) 		if (leaf_age <= tree->maxage && leaf_age > age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) 			age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) 		s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) 	return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826)  * Minimum age of any character in s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827)  * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828)  * Return 0 if non-assigned code points are used.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) int utf8agemin(struct tree *tree, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) 	utf8leaf_t	*leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) 	int		age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) 	int		leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) 	unsigned char	hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) 	if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) 	age = tree->maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) 	while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) 		leaf = utf8lookup(tree, hangul, s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) 		if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) 			return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) 		leaf_age = ages[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) 		if (leaf_age <= tree->maxage && leaf_age < age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) 			age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) 		s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) 	return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853)  * Maximum age of any character in s, touch at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854)  * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) int utf8nagemax(struct tree *tree, const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) 	utf8leaf_t	*leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) 	int		age = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) 	int		leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) 	unsigned char	hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) 	if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866)         while (len && *s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) 		leaf = utf8nlookup(tree, hangul, s, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) 		if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) 			return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) 		leaf_age = ages[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) 		if (leaf_age <= tree->maxage && leaf_age > age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) 			age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) 		len -= utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) 		s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) 	return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880)  * Maximum age of any character in s, touch at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881)  * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) int utf8nagemin(struct tree *tree, const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) 	utf8leaf_t	*leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) 	int		leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) 	int		age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) 	unsigned char	hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) 	if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) 	age = tree->maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893)         while (len && *s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) 		leaf = utf8nlookup(tree, hangul, s, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) 		if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) 			return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) 		leaf_age = ages[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) 		if (leaf_age <= tree->maxage && leaf_age < age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) 			age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) 		len -= utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) 		s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) 	return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907)  * Length of the normalization of s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908)  * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910)  * A string of Default_Ignorable_Code_Point has length 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) ssize_t utf8len(struct tree *tree, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) 	utf8leaf_t	*leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) 	size_t		ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) 	unsigned char	hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) 	if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) 	while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) 		leaf = utf8lookup(tree, hangul, s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) 		if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) 			return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) 			ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) 		else if (LEAF_CCC(leaf) == DECOMPOSE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) 			ret += strlen(LEAF_STR(leaf));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) 		else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) 			ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) 		s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936)  * Length of the normalization of s, touch at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937)  * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) 	utf8leaf_t	*leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) 	size_t		ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) 	unsigned char	hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) 	if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) 	while (len && *s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) 		leaf = utf8nlookup(tree, hangul, s, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) 		if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) 			return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) 			ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) 		else if (LEAF_CCC(leaf) == DECOMPOSE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) 			ret += strlen(LEAF_STR(leaf));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) 		else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) 			ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) 		len -= utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) 		s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) 	return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964)  * Cursor structure used by the normalizer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) struct utf8cursor {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) 	struct tree	*tree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) 	const char	*s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) 	const char	*p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) 	const char	*ss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) 	const char	*sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) 	unsigned int	len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) 	unsigned int	slen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) 	short int	ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) 	short int	nccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) 	unsigned int	unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) 	unsigned char	hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981)  * Set up an utf8cursor for use by utf8byte().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983)  *   s      : string.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984)  *   len    : length of s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985)  *   u8c    : pointer to cursor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986)  *   trie   : utf8trie_t to use for normalization.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988)  * Returns -1 on error, 0 on success.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) 		size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) 	if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) 	if (!s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) 	u8c->tree = tree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) 	u8c->s = s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) 	u8c->p = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) 	u8c->ss = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001) 	u8c->sp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) 	u8c->len = len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) 	u8c->slen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) 	u8c->ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) 	u8c->nccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) 	u8c->unichar = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) 	/* Check we didn't clobber the maximum length. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) 	if (u8c->len != len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) 	/* The first byte of s may not be an utf8 continuation. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) 	if (len > 0 && (*s & 0xC0) == 0x80)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015) 
/*
 * Set up an utf8cursor for use by utf8byte().
 *
 *   u8c    : pointer to cursor.
 *   tree   : tree to use for normalization.
 *   s      : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with the largest unsigned int as length.
 *
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
	const unsigned int	nolimit = (unsigned int)-1;

	return utf8ncursor(u8c, tree, s, nolimit);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031)  * Get one byte from the normalized form of the string described by u8c.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033)  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035)  * The cursor keeps track of the location in the string in u8c->s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036)  * When a character is decomposed, the current location is stored in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037)  * u8c->p, and u8c->s is set to the start of the decomposition. Note
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038)  * that bytes from a decomposition do not count against u8c->len.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040)  * Characters are emitted if they match the current CCC in u8c->ccc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041)  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042)  * and the function returns 0 in that case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044)  * Sorting by CCC is done by repeatedly scanning the string.  The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045)  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046)  * the start of the scan.  The first pass finds the lowest CCC to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047)  * emitted and stores it in u8c->nccc, the second pass emits the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048)  * characters with this CCC and finds the next lowest CCC. This limits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049)  * the number of passes to 1 + the number of different CCCs in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050)  * sequence being scanned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051)  *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052)  * Therefore:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053)  *  u8c->p  != NULL -> a decomposition is being scanned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054)  *  u8c->ss != NULL -> this is a repeating scan.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055)  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056)  */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) int utf8byte(struct utf8cursor *u8c)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) 	utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) 	int ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) 	for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) 		/* Check for the end of a decomposed character. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) 		if (u8c->p && *u8c->s == '\0') {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) 			u8c->s = u8c->p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) 			u8c->p = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) 		/* Check for end-of-string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070) 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) 			/* There is no next byte. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072) 			if (u8c->ccc == STOPPER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) 				return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) 			/* End-of-string during a scan counts as a stopper. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075) 			ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) 			goto ccc_mismatch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077) 		} else if ((*u8c->s & 0xC0) == 0x80) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) 			/* This is a continuation of the current character. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) 			if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) 				u8c->len--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) 			return (unsigned char)*u8c->s++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) 		/* Look up the data for the current character. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) 		if (u8c->p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) 			leaf = utf8nlookup(u8c->tree, u8c->hangul,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) 					   u8c->s, u8c->len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) 		/* No leaf found implies that the input is a binary blob. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) 		if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) 			return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) 		/* Characters that are too new have CCC 0. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) 		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) 			ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099) 		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) 			u8c->len -= utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) 			u8c->p = u8c->s + utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) 			u8c->s = LEAF_STR(leaf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103) 			/* Empty decomposition implies CCC 0. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) 			if (*u8c->s == '\0') {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) 				if (u8c->ccc == STOPPER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) 					continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) 				ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) 				goto ccc_mismatch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) 			}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) 			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) 			ccc = LEAF_CCC(leaf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) 		u8c->unichar = utf8decode(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) 		 * If this is not a stopper, then see if it updates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) 		 * the next canonical class to be emitted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119) 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) 			u8c->nccc = ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122) 		/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) 		 * Return the current byte if this is the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) 		 * combining class.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) 		 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) 		if (ccc == u8c->ccc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) 			if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) 				u8c->len--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) 			return (unsigned char)*u8c->s++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) 		/* Current combining class mismatch. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) 	ccc_mismatch:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) 		if (u8c->nccc == STOPPER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) 			/*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) 			 * Scan forward for the first canonical class
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) 			 * to be emitted.  Save the position from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) 			 * which to restart.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) 			 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) 			assert(u8c->ccc == STOPPER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) 			u8c->ccc = MINCCC - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) 			u8c->nccc = ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) 			u8c->sp = u8c->p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144) 			u8c->ss = u8c->s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) 			u8c->slen = u8c->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146) 			if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) 				u8c->len -= utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) 			u8c->s += utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) 		} else if (ccc != STOPPER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150) 			/* Not a stopper, and not the ccc we're emitting. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) 			if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) 				u8c->len -= utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) 			u8c->s += utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) 		} else if (u8c->nccc != MAXCCC + 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) 			/* At a stopper, restart for next ccc. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) 			u8c->ccc = u8c->nccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157) 			u8c->nccc = MAXCCC + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) 			u8c->s = u8c->ss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159) 			u8c->p = u8c->sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) 			u8c->len = u8c->slen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) 		} else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162) 			/* All done, proceed from here. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) 			u8c->ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164) 			u8c->nccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) 			u8c->sp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) 			u8c->ss = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167) 			u8c->slen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) static int normalize_line(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) 	char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) 	char *t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) 	int c;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) 	struct utf8cursor u8c;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181) 	/* First test: null-terminated string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182) 	s = buf2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) 	t = buf3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) 	if (utf8cursor(&u8c, tree, s))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) 	while ((c = utf8byte(&u8c)) > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) 		if (c != (unsigned char)*t++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) 			return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) 	if (c < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) 	if (*t != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) 	/* Second test: length-limited string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) 	s = buf2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) 	/* Replace NUL with a value that will cause an error if seen. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) 	s[strlen(s) + 1] = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) 	t = buf3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) 	if (utf8cursor(&u8c, tree, s))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) 	while ((c = utf8byte(&u8c)) > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202) 		if (c != (unsigned char)*t++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) 			return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) 	if (c < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) 	if (*t != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) 		return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) static void normalization_test(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) 	FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) 	struct unicode_data *data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217) 	char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) 	char *t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) 	int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) 	int ignorables;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221) 	int tests = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) 	int failures = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) 		printf("Parsing %s\n", test_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) 	/* Step one, read data from file. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) 	file = fopen(test_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228) 	if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) 		open_fail(test_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231) 	while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) 		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) 			     buf0, buf1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) 		if (ret != 2 || *line == '#')
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) 			continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) 		s = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237) 		t = buf2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) 		while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) 			unichar = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) 			t += utf8encode(t, unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) 		*t = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) 		ignorables = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) 		s = buf1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) 		t = buf3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) 		while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248) 			unichar = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) 			data = &unicode_data[unichar];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) 			if (data->utf8nfdi && !*data->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251) 				ignorables = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) 			else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) 				t += utf8encode(t, unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) 		*t = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) 		tests++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) 		if (normalize_line(nfdi_tree) < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) 			printf("Line %s -> %s", buf0, buf1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) 			if (ignorables)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) 				printf(" (ignorables removed)");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) 			printf(" failure\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) 			failures++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266) 	fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) 		printf("Ran %d tests with %d failures\n", tests, failures);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269) 	if (failures)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) 		file_fail(test_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) static void write_file(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) 	FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278) 	int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) 	int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) 	int t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281) 	int gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) 	if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) 		printf("Writing %s\n", utf8_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) 	file = fopen(utf8_name, "w");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) 	if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) 		open_fail(utf8_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) 	fprintf(file, "/* This file is generated code, do not edit. */\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) 	fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) 	fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292) 	fprintf(file, "#endif\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) 	fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) 	fprintf(file, "static const unsigned int utf8vers = %#x;\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) 		unicode_maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) 	fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) 	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) 	for (i = 0; i != ages_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) 		fprintf(file, "\t%#x%s\n", ages[i],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) 			ages[i] == unicode_maxage ? "" : ",");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) 	fprintf(file, "};\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) 	fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) 	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) 	t = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) 	for (gen = 0; gen < ages_count; gen++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) 		fprintf(file, "\t{ %#x, %d }%s\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) 			ages[gen], trees[t].index,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) 			ages[gen] == unicode_maxage ? "" : ",");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) 		if (trees[t].maxage == ages[gen])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) 			t += 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) 	fprintf(file, "};\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) 	fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314) 	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) 	t = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316) 	for (gen = 0; gen < ages_count; gen++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) 		fprintf(file, "\t{ %#x, %d }%s\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) 			ages[gen], trees[t].index,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) 			ages[gen] == unicode_maxage ? "" : ",");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) 		if (trees[t].maxage == ages[gen])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) 			t += 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) 	fprintf(file, "};\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) 	fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) 	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) 		utf8data_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) 	t = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) 	for (i = 0; i != utf8data_size; i += 16) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) 		if (i == trees[t].index) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) 			fprintf(file, "\t/* %s_%x */\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) 				trees[t].type, trees[t].maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332) 			if (t < trees_count-1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) 				t++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335) 		fprintf(file, "\t");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) 		for (j = i; j != i + 16; j++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) 			fprintf(file, "0x%.2x%s", utf8data[j],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) 				(j < utf8data_size -1 ? "," : ""));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) 		fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) 	fprintf(file, "};\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) 	fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) int main(int argc, char *argv[])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) 	unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350) 	int opt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) 	argv0 = argv[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) 	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) 		switch (opt) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) 		case 'a':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) 			age_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) 		case 'c':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360) 			ccc_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) 		case 'd':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) 			data_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) 		case 'f':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) 			fold_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) 		case 'n':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) 			norm_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) 		case 'o':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) 			utf8_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) 		case 'p':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) 			prop_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) 		case 't':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) 			test_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) 		case 'v':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) 			verbose++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) 			break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) 		case 'h':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) 			help();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385) 			exit(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) 		default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387) 			usage();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388) 		}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) 	}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391) 	if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) 		help();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) 	for (unichar = 0; unichar != 0x110000; unichar++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) 		unicode_data[unichar].code = unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) 	age_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) 	ccc_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397) 	nfdi_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) 	nfdicf_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) 	ignore_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) 	corrections_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) 	hangul_decompose();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) 	nfdi_decompose();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) 	nfdicf_decompose();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) 	utf8_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) 	trees_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) 	trees_populate();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) 	trees_reduce();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) 	trees_verify();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) 	/* Prevent "unused function" warning. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) 	(void)lookup(nfdi_tree, " ");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411) 	if (verbose > 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) 		tree_walk(nfdi_tree);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) 	if (verbose > 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) 		tree_walk(nfdicf_tree);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) 	normalization_test();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) 	write_file();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417) 
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) 	return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) }