^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) * Copyright (c) 2014 SGI.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * All rights reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * This program is free software; you can redistribute it and/or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) * modify it under the terms of the GNU General Public License as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * published by the Free Software Foundation.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) * This program is distributed in the hope that it would be useful,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) * but WITHOUT ANY WARRANTY; without even the implied warranty of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) * GNU General Public License for more details.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) * You should have received a copy of the GNU General Public License
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) * along with this program; if not, write the Free Software Foundation,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) /* Generator for a compact trie for unicode normalization */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) #include <sys/types.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) #include <stddef.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) #include <stdlib.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) #include <stdio.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) #include <assert.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) #include <string.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) #include <unistd.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) #include <errno.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29)
/*
 * Default names of the in- and output files.
 *
 * Every name exists twice: as a compile-time macro holding the default,
 * and as a global pointer that starts out pointing at that default.
 */

#define AGE_NAME	"DerivedAge.txt"
#define CCC_NAME	"DerivedCombiningClass.txt"
#define PROP_NAME	"DerivedCoreProperties.txt"
#define DATA_NAME	"UnicodeData.txt"
#define FOLD_NAME	"CaseFolding.txt"
#define NORM_NAME	"NormalizationCorrections.txt"
#define TEST_NAME	"NormalizationTest.txt"
#define UTF8_NAME	"utf8data.h"

const char *age_name  = AGE_NAME;
const char *ccc_name  = CCC_NAME;
const char *prop_name = PROP_NAME;
const char *data_name = DATA_NAME;
const char *fold_name = FOLD_NAME;
const char *norm_name = NORM_NAME;
const char *test_name = TEST_NAME;
const char *utf8_name = UTF8_NAME;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49)
/* Verbosity setting (going by the name); starts out at 0. */
int verbose = 0;

/* An arbitrary line size limit on input lines. */

#define LINESIZE	1024

/* File-scope buffers, each LINESIZE bytes long. */
char line[LINESIZE];
char buf0[LINESIZE];
char buf1[LINESIZE];
char buf2[LINESIZE];
char buf3[LINESIZE];

/* NOTE(review): presumably the program name taken from argv[0] - confirm. */
const char *argv0;

/* Number of elements in a true array (not valid for a pointer). */
#define ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) * Unicode version numbers consist of three parts: major, minor, and a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) * revision. These numbers are packed into an unsigned int to obtain
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) * a single version number.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) * To save space in the generated trie, the unicode version is not
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) * stored directly, instead we calculate a generation number from the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) * unicode versions seen in the DerivedAge file, and use that as an
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) * index into a table of unicode versions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) */
/* Bit positions of the major and minor parts inside a packed age. */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

/* Largest value each part may take: 65535 / 255 / 255. */
#define UNICODE_MAJ_MAX			((unsigned short)-1)
#define UNICODE_MIN_MAX			((unsigned char)-1)
#define UNICODE_REV_MAX			((unsigned char)-1)

/* Pack major, minor and revision into a single unsigned int. */
#define UNICODE_AGE(MAJ,MIN,REV)			\
	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
	 ((unsigned int)(REV)))

/* Table of unicode versions and the number of entries in it. */
unsigned int *ages;
int ages_count;

/* NOTE(review): presumably the newest packed age seen - set elsewhere. */
unsigned int unicode_maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) static int age_valid(unsigned int major, unsigned int minor,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) unsigned int revision)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) if (major > UNICODE_MAJ_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) if (minor > UNICODE_MIN_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) if (revision > UNICODE_REV_MAX)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107)
/*
 * utf8trie_t
 *
 * A compact binary tree, used to decode UTF-8 characters.
 *
 * Internal nodes are one byte for the node itself, and up to three
 * bytes for an offset into the tree.  The first byte contains the
 * following information:
 *  NEXTBYTE  - flag        - advance to next byte if set
 *  BITNUM    - 3 bit field - the bit number to be tested
 *  OFFLEN    - 2 bit field - number of bytes in the offset
 * if offlen == 0 (non-branching node)
 *  RIGHTPATH - 1 bit field - set if the following node is for the
 *                            right-hand path (tested bit is set)
 *  TRIENODE  - 1 bit field - set if the following node is an internal
 *                            node, otherwise it is a leaf node
 * if offlen != 0 (branching node)
 *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 *
 * Due to the way utf8 works, there cannot be branching nodes with
 * NEXTBYTE set, and moreover those nodes always have a righthand
 * descendant.
 */
/* One byte of the emitted trie; the flag layout follows. */
typedef unsigned char utf8trie_t;
#define BITNUM		0x07	/* bit number to be tested */
#define NEXTBYTE	0x08	/* advance to the next byte if set */
#define OFFLEN		0x30	/* number of bytes in the offset */
#define OFFLEN_SHIFT	4
/* Non-branching nodes (offlen == 0). */
#define RIGHTPATH	0x40	/* following node is on the right-hand path */
#define TRIENODE	0x80	/* following node is internal, not a leaf */
/* Branching nodes (offlen != 0); these reuse the two bits above. */
#define RIGHTNODE	0x40	/* right-hand node is internal */
#define LEFTNODE	0x80	/* left-hand node is internal */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) * utf8leaf_t
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) * The leaves of the trie are embedded in the trie, and so the same
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) * underlying datatype, unsigned char.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) * leaf[0]: The unicode version, stored as a generation number that is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149) * an index into utf8agetab[]. With this we can filter code
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) * points based on the unicode version in which they were
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) * defined. The CCC of a non-defined code point is 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152) * leaf[1]: Canonical Combining Class. During normalization, we need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) * to do a stable sort into ascending order of all characters
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) * with a non-zero CCC that occur between two characters with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) * a CCC of 0, or at the begin or end of a string.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156) * The unicode standard guarantees that all CCC values are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) * between 0 and 254 inclusive, which leaves 255 available as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) * a special value.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) * Code points with CCC 0 are known as stoppers.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) * start of a NUL-terminated string that is the decomposition
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) * of the character.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) * The CCC of a decomposable character is the same as the CCC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) * of the first character of its decomposition.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) * Some characters decompose as the empty string: these are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) * characters with the Default_Ignorable_Code_Point property.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) * These do affect normalization, as they all have CCC 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) * The decompositions in the trie have been fully expanded.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) * Casefolding, if applicable, is also done using decompositions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) */
/* One byte of a leaf; leaves share the trie's underlying byte type. */
typedef unsigned char utf8leaf_t;

/* Accessors for the three leaf fields: generation, CCC, decomposition. */
#define LEAF_GEN(LEAF)	((LEAF)[0])
#define LEAF_CCC(LEAF)	((LEAF)[1])
#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))

/* Largest generation number a one-byte leaf[0] can hold. */
#define MAXGEN		(255)

/* Range of real CCC values, plus the special values stored in leaf[1]. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)	/* CCC 0 */
#define DECOMPOSE	(255)	/* leaf[2] starts a decomposition string */
/* NOTE(review): marker byte for Hangul; its use is not visible here. */
#define HANGUL		((char)(255))

/* NOTE(review): presumably the byte size of a Hangul leaf - confirm. */
#define UTF8HANGULLEAF	(12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) struct tree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) const char *, size_t);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192) static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) unsigned char *utf8data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) size_t utf8data_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) utf8trie_t *nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) utf8trie_t *nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201)
/*
 * UTF8 valid ranges.
 *
 * The UTF-8 encoding spreads the bits of a 32bit word over several
 * bytes. This table gives the ranges that can be held and how they'd
 * be represented.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * There is an additional requirement on UTF-8, in that only the
 * shortest representation of a 32bit value is to be used.  A decoder
 * must not decode sequences that do not satisfy this requirement.
 * Thus the allowed ranges have a lower bound.
 *
 * 0x00000000 0x0000007F: 0xxxxxxx
 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 * 17 planes of 65536 values.  This limits the sequences actually seen
 * even more, to just the following.
 *
 *          0 -     0x7f: 0                     0x7f
 *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
 *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
 *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
 *
 * Even within those ranges not all values are allowed: the surrogates
 * 0xd800 - 0xdfff should never be seen.
 *
 * Note that the longest sequence seen with valid usage is 4 bytes,
 * the same as a single UTF-32 character.  This makes the UTF-8
 * representation of Unicode strictly smaller than UTF-32.
 *
 * The shortest sequence requirement was introduced by:
 *    Corrigendum #1: UTF-8 Shortest Form
 * It can be found here:
 *    http://www.unicode.org/versions/corrigendum1.html
 *
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250)
/*
 * Lead-byte and continuation-byte patterns.  A byte b matches a class
 * when (b & UTF8_x_MASK) == UTF8_x_BITS.
 */
#define UTF8_2_BITS	0xC0	/* 110xxxxx: lead byte of a 2-byte sequence */
#define UTF8_3_BITS	0xE0	/* 1110xxxx: lead byte of a 3-byte sequence */
#define UTF8_4_BITS	0xF0	/* 11110xxx: lead byte of a 4-byte sequence */
#define UTF8_N_BITS	0x80	/* 10xxxxxx: continuation byte */
#define UTF8_2_MASK	0xE0
#define UTF8_3_MASK	0xF0
#define UTF8_4_MASK	0xF8
#define UTF8_N_MASK	0xC0
#define UTF8_V_MASK	0x3F	/* payload bits of a continuation byte */
#define UTF8_V_SHIFT	6	/* payload bits per continuation byte */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) static int utf8encode(char *str, unsigned int val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) int len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) if (val < 0x80) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) str[0] = val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) len = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) } else if (val < 0x800) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) str[1] = val & UTF8_V_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) str[1] |= UTF8_N_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) val >>= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) str[0] = val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) str[0] |= UTF8_2_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) len = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) } else if (val < 0x10000) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) str[2] = val & UTF8_V_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) str[2] |= UTF8_N_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) val >>= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) str[1] = val & UTF8_V_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) str[1] |= UTF8_N_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) val >>= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) str[0] = val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) str[0] |= UTF8_3_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) len = 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) } else if (val < 0x110000) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) str[3] = val & UTF8_V_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) str[3] |= UTF8_N_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) val >>= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) str[2] = val & UTF8_V_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) str[2] |= UTF8_N_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) val >>= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) str[1] = val & UTF8_V_MASK;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) str[1] |= UTF8_N_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) val >>= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) str[0] = val;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) str[0] |= UTF8_4_BITS;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) len = 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300) printf("%#x: illegal val\n", val);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) len = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) return len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) static unsigned int utf8decode(const char *str)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) const unsigned char *s = (const unsigned char*)str;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) unsigned int unichar = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) if (*s < 0x80) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) unichar = *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) } else if (*s < UTF8_3_BITS) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) unichar = *s++ & 0x1F;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) unichar <<= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316) unichar |= *s & 0x3F;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) } else if (*s < UTF8_4_BITS) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) unichar = *s++ & 0x0F;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) unichar <<= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) unichar |= *s++ & 0x3F;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) unichar <<= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) unichar |= *s & 0x3F;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) unichar = *s++ & 0x0F;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) unichar <<= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) unichar |= *s++ & 0x3F;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) unichar <<= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) unichar |= *s++ & 0x3F;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329) unichar <<= UTF8_V_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) unichar |= *s & 0x3F;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332) return unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334)
/*
 * utf32valid - range check for a code point.
 *
 * Returns nonzero when unichar is at most 0x10FFFF, and 0 otherwise.
 * Surrogates are not excluded.
 */
static int utf32valid(unsigned int unichar)
{
	return unichar <= 0x10FFFF;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339)
/* True for code points in 0xAC00..0xD7A3; note that U is evaluated twice. */
#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)

/* Tags telling whether a child pointer refers to a struct node or a leaf. */
#define NODE 1
#define LEAF 0
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344)
/*
 * struct tree - one trie together with the callbacks for its leaves.
 *
 * Leaves are opaque (void *) to the tree code and are only handled
 * through the leaf_* callbacks.
 */
struct tree {
	void *root;		/* a struct node or a leaf, per childnode */
	int childnode;		/* NODE or LEAF: what root points at */
	const char *type;	/* label printed by tree_walk() */
	unsigned int maxage;	/* printed in hex next to type */
	struct tree *next;	/* another tree; use not visible here */
	/*
	 * Leaf callbacks.  tree_walk() calls leaf_print(leaf, indent);
	 * NOTE(review): the others are described by name only - confirm
	 * against their callers.
	 */
	int (*leaf_equal)(void *, void *);
	void (*leaf_print)(void *, int);
	int (*leaf_mark)(void *);
	int (*leaf_size)(void *);
	int *(*leaf_index)(struct tree *, void *);
	unsigned char *(*leaf_emit)(void *, unsigned char *);
	int leafindex[0x110000];	/* one slot per code point value */
	int index;
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360)
/*
 * struct node - internal node of the in-memory tree.
 *
 * Each node tests one bit of the current key byte: left is followed
 * when the bit is clear, right when it is set.
 */
struct node {
	/* NOTE(review): bookkeeping fields; their use is not visible here. */
	int index;
	int offset;
	int mark;
	int size;
	struct node *parent;
	void *left;		/* child taken when the tested bit is 0 */
	void *right;		/* child taken when the tested bit is 1 */
	unsigned char bitnum;	/* which bit of the key byte is tested */
	unsigned char nextbyte;	/* nonzero: step to the next key byte first */
	unsigned char leftnode;	/* NODE or LEAF: what left points at */
	unsigned char rightnode;	/* NODE or LEAF: what right points at */
	unsigned int keybits;	/* printed by tree_walk() as "bits" */
	unsigned int keymask;	/* printed by tree_walk() as "mask" */
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) * Example lookup function for a tree.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) static void *lookup(struct tree *tree, const char *key)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383) void *leaf = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386) while (!leaf && node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) if (node->nextbyte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) key++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) if (*key & (1 << (node->bitnum & 7))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) /* Right leg */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) if (node->rightnode == NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) } else if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) leaf = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) node = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) /* Left leg */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) if (node->leftnode == NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) } else if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) leaf = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) node = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) return leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) * A simple non-recursive tree walker: keep track of visits to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) * left and right branches in the leftmask and rightmask.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) static void tree_walk(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) int indent = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) int nodes, singletons, leaves;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) nodes = singletons = leaves = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) if (tree->childnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) assert(tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) tree->leaf_print(tree->root, indent);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) leaves = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) printf("%*snode @ %p bitnum %d nextbyte %d"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) " left %p right %p mask %x bits %x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) indent, "", node,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) node->bitnum, node->nextbyte,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) node->left, node->right,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) node->keymask, node->keybits);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) nodes += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) if (!(node->left && node->right))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) singletons += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) if ((leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) tree->leaf_print(node->left,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) indent+1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) leaves += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) } else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) if ((rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) tree->leaf_print(node->right,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) indent+1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) leaves += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) } else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) indent -= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) printf("nodes %d leaves %d singletons %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) nodes, leaves, singletons);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) * Allocate an initialize a new internal node.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) static struct node *alloc_node(struct node *parent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) int bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) node = malloc(sizeof(*node));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) node->left = node->right = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) node->parent = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) node->leftnode = NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) node->rightnode = NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) node->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) node->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) node->mark = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) node->index = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) node->offset = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) node->size = 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) if (node->parent) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) bitnum = parent->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) if ((bitnum & 7) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) node->bitnum = bitnum + 7 + 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) node->nextbyte = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) node->bitnum = bitnum - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) node->nextbyte = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) node->bitnum = 7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) node->nextbyte = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) return node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) * Insert a new leaf into the tree, and collapse any subtrees that are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) * fully populated and end in identical leaves. A nextbyte tagged
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) * internal node will not be removed to preserve the tree's integrity.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) * Note that due to the structure of utf8, no nextbyte tagged node
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) * will be a candidate for removal.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) static int insert(struct tree *tree, char *key, int keylen, void *leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) struct node *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) void **cursor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) int keybits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) assert(keylen >= 1 && keylen <= 4);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) node = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543) cursor = &tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) keybits = 8 * keylen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) /* Insert, creating path along the way. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) while (keybits) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) if (!*cursor)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) *cursor = alloc_node(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) node = *cursor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) if (node->nextbyte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) key++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) if (*key & (1 << (node->bitnum & 7)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) cursor = &node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) cursor = &node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) keybits--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) *cursor = leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) /* Merge subtrees if possible. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) if (*key & (1 << (node->bitnum & 7)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) node->rightnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) node->leftnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) if (node->nextbyte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) if (node->leftnode == NODE || node->rightnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) /* Compare */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) if (! tree->leaf_equal(node->left, node->right))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) /* Keep left, drop right leaf. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) leaf = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) /* Check in parent */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) parent = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) if (!parent) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) /* root of tree! */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) tree->root = leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) tree->childnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) } else if (parent->left == node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) parent->left = leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) parent->leftnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) if (parent->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) parent->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) parent->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) parent->keymask |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) } else if (parent->right == node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) parent->right = leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) parent->rightnode = LEAF;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) if (parent->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) parent->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) parent->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) parent->keymask |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) parent->keybits |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) /* internal tree error */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) free(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) node = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) /* Propagate keymasks up along singleton chains. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) parent = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) if (!parent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) /* Nix the mask for parents with two children. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) if (node->keymask == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) parent->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) parent->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) } else if (parent->left && parent->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) parent->keymask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) parent->keybits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) assert((parent->keymask & node->keymask) == 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) parent->keymask |= node->keymask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) parent->keymask |= (1 << parent->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) parent->keybits |= node->keybits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) if (parent->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) parent->keybits |= (1 << parent->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) node = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) * Prune internal nodes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) * Fully populated subtrees that end at the same leaf have already
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) * been collapsed. There are still internal nodes that have for both
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) * their left and right branches a sequence of singletons that make
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) * identical choices and end in identical leaves. The keymask and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) * keybits collected in the nodes describe the choices made in these
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) * singleton chains. When they are identical for the left and right
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) * branch of a node, and the two leaves comare identical, the node in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) * question can be removed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) * Note that nodes with the nextbyte tag set will not be removed by
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) * this to ensure tree integrity. Note as well that the structure of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) * utf8 ensures that these nodes would not have been candidates for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) * removal in any case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) static void prune(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) struct node *left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) struct node *right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) struct node *parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) void *leftleaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) void *rightleaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) printf("Pruning %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) if (tree->childnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) if (!tree->root)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) if (node->nextbyte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) if (node->leftnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) if (node->rightnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) if (!node->left)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) if (!node->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) left = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) if (left->keymask == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) if (right->keymask == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) if (left->keymask != right->keymask)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) if (left->keybits != right->keybits)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) leftleaf = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) while (!leftleaf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) assert(left->left || left->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) if (left->leftnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) leftleaf = left->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) else if (left->rightnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) leftleaf = left->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) else if (left->left)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) left = left->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) else if (left->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) left = left->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) rightleaf = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) while (!rightleaf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) assert(right->left || right->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) if (right->leftnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) rightleaf = right->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) else if (right->rightnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719) rightleaf = right->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) else if (right->left)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) right = right->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) else if (right->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) right = right->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) if (! tree->leaf_equal(leftleaf, rightleaf))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) goto advance;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) * This node has identical singleton-only subtrees.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) * Remove it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) parent = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) left = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) if (parent->left == node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) parent->left = left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) else if (parent->right == node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) parent->right = left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) left->parent = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) left->keymask |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) node->left = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) if (node->leftnode == NODE && node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) left = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) free(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) node = left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) } else if (node->rightnode == NODE && node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) free(node);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) node = right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) node = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) /* Propagate keymasks up along singleton chains. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) node = parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) /* Force re-check */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) if (node->left && node->right)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773) left = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) node->keymask |= left->keymask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) node->keybits |= left->keybits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) node->keymask |= right->keymask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) node->keybits |= right->keybits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) node->keymask |= (1 << node->bitnum);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) /* Force re-check */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) advance:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) if ((leftmask & bitmask) == 0 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) node->leftnode == NODE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) } else if ((rightmask & bitmask) == 0 &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797) node->rightnode == NODE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800) node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) printf("Pruned %d nodes\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) * Mark the nodes in the tree that lead to leaves that must be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) * emitted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) static void mark_nodes(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) struct node *n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) int marked;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) marked = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) printf("Marking %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) if (tree->childnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) if ((leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) if (tree->leaf_mark(node->left)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) n = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) while (n && !n->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) n->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) n = n->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) } else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) if ((rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) if (tree->leaf_mark(node->right)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) n = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) while (n && !n->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) n->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) n = n->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) } else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) /* second pass: left siblings and singletons */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) if ((leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) if (tree->leaf_mark(node->left)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) n = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) while (n && !n->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) n->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) n = n->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) } else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) if (!node->mark && node->parent->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) node->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) if ((rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) if (tree->leaf_mark(node->right)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) n = node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) while (n && !n->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) n->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) n = n->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) } else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) if (!node->mark && node->parent->mark &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) !node->parent->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) marked++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) node->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) printf("Marked %d nodes\n", marked);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) * Compute the index of each node and leaf, which is the offset in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) * emitted trie. These values must be pre-computed because relative
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) * offsets between nodes are used to navigate the tree.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) static int index_nodes(struct tree *tree, int index)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) int indent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) /* Align to a cache line (or half a cache line?). */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) while (index % 64)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) tree->index = index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) indent = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) if (tree->childnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) index += tree->leaf_size(tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) if (!node->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) goto skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) if (node->index != index)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) node->index = index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) index += node->size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) skip:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) if (node->mark && (leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) *tree->leaf_index(tree, node->left) =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983) index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) index += tree->leaf_size(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) } else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) if (node->mark && (rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) *tree->leaf_index(tree, node->right) = index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) index += tree->leaf_size(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) } else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) indent -= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) /* Round up to a multiple of 16 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) while (index % 16)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) printf("Final index %d\n", index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) return index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) * Mark the nodes in a subtree, helper for size_nodes().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) static int mark_subtree(struct node *node)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) int changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) if (!node || node->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) node->mark = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) node->index = node->parent->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) changed = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) if (node->leftnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) changed += mark_subtree(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) if (node->rightnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) changed += mark_subtree(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) return changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) * Compute the size of nodes and leaves. We start by assuming that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) * each node needs to store a three-byte offset. The indexes of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) * nodes are calculated based on that, and then this function is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) * called to see if the sizes of some nodes can be reduced. This is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) * repeated until no more changes are seen.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) static int size_nodes(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) struct tree *next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) struct node *right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) struct node *n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) unsigned int pathbits;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) unsigned int pathmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) unsigned int nbit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) int changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) int offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) int size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) int indent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) indent = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) changed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) printf("Sizing %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) if (tree->childnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) pathbits = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) pathmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077) node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) if (!node->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) goto skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) offset = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) if (!node->left || !node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) size = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) if (node->rightnode == NODE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) * If the right node is not marked,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) * look for a corresponding node in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) * the next tree. Such a node need
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) * not exist.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) right = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) next = tree->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) while (!right->mark) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) assert(next);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) n = next->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) while (n->bitnum != node->bitnum) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) nbit = 1 << n->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) if (!(pathmask & nbit))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) if (pathbits & nbit) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) if (n->rightnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) n = n->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) if (n->leftnode == LEAF)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) n = n->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) if (n->bitnum != node->bitnum)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) n = n->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) right = n;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) next = next->next;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) /* Make sure the right node is marked. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) if (!right->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) changed += mark_subtree(right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) offset = right->index - node->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) offset = *tree->leaf_index(tree, node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) offset -= node->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) assert(offset >= 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) assert(offset <= 0xffffff);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) if (offset <= 0xff) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) size = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) } else if (offset <= 0xffff) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) size = 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) } else { /* offset <= 0xffffff */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) size = 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) if (node->size != size || node->offset != offset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) node->size = size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) node->offset = offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) changed++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) skip:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) pathmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) if (node->mark && (leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) } else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) if (node->mark && (rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) pathbits |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) } else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) pathmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) pathbits &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) indent -= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) printf("Found %d changes\n", changed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) return changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) * Emit a trie for the given tree into the data array.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) static void emit(struct tree *tree, unsigned char *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) struct node *node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) unsigned int leftmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) unsigned int rightmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) unsigned int bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) int offlen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) int offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) int index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) int indent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) int size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) int bytes;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) int leaves;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) int nodes[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) unsigned char byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) leaves = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) bytes = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) index = tree->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) data += index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) indent = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) printf("Emitting %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) if (tree->childnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) assert(tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) tree->leaf_emit(tree->root, data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) size = tree->leaf_size(tree->root);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) index += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) leaves++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) goto done;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) assert(tree->childnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) node = tree->root;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) leftmask = rightmask = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) if (!node->mark)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) goto skip;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) assert(node->offset != -1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) assert(node->index == index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) byte = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) if (node->nextbyte)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) byte |= NEXTBYTE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) byte |= (node->bitnum & BITNUM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) if (node->left && node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) if (node->leftnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) byte |= LEFTNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) if (node->rightnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) byte |= RIGHTNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236) if (node->offset <= 0xff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) offlen = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) else if (node->offset <= 0xffff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) offlen = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) offlen = 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) nodes[offlen]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243) offset = node->offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) byte |= offlen << OFFLEN_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) *data++ = byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) while (offlen--) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) *data++ = offset & 0xff;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250) offset >>= 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252) } else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) if (node->leftnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) byte |= TRIENODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) nodes[0]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) *data++ = byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) } else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259) byte |= RIGHTNODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) if (node->rightnode == NODE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) byte |= TRIENODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) nodes[0]++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) *data++ = byte;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) index++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) assert(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) skip:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) bitmask = 1 << node->bitnum;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) if (node->mark && (leftmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) leftmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) if (node->leftnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274) assert(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) data = tree->leaf_emit(node->left,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) size = tree->leaf_size(node->left);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) index += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) bytes += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) leaves++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) } else if (node->left) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) assert(node->leftnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) node = node->left;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) if (node->mark && (rightmask & bitmask) == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) rightmask |= bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) if (node->rightnode == LEAF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) assert(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) data = tree->leaf_emit(node->right,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293) data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) size = tree->leaf_size(node->right);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) index += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) bytes += size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) leaves++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) } else if (node->right) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) assert(node->rightnode == NODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) indent += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301) node = node->right;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) leftmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) rightmask &= ~bitmask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) node = node->parent;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308) indent -= 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) done:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312) if (verbose > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) printf("Emitted %d (%d) leaves",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) leaves, bytes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) printf(" %d (%d+%d+%d+%d) nodes",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) nodes[0] + nodes[1] + nodes[2] + nodes[3],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) nodes[0], nodes[1], nodes[2], nodes[3]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) printf(" %d total\n", index - tree->index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323)
/*
 * Unicode data.
 *
 * Per code point we track the Canonical Combining Class, the Age, and
 * the decompositions.
 *
 * The Age is stored as an index into the ages table, i.e. a generation
 * number which that table maps to a Unicode version.
 *
 * A non-zero correction field means the entry lives in the corrections
 * array, which holds decompositions that were corrected in later
 * revisions. Its value is the Unicode version in which the mapping was
 * corrected.
 */
struct unicode_data {
	unsigned int code;		/* code point */
	int ccc;			/* Canonical Combining Class */
	int gen;			/* index into the ages table */
	int correction;			/* version that corrected the mapping */
	unsigned int *utf32nfdi;	/* nfdi decomposition, UTF-32 */
	unsigned int *utf32nfdicf;	/* nfdicf decomposition, UTF-32 */
	char *utf8nfdi;			/* nfdi decomposition, UTF-8 string */
	char *utf8nfdicf;		/* nfdicf decomposition, UTF-8 string */
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) struct unicode_data unicode_data[0x110000];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351) struct unicode_data *corrections;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) int corrections_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354) struct tree *nfdi_tree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) struct tree *nfdicf_tree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) struct tree *trees;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358) int trees_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) * Check the corrections array to see if this entry was corrected at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) * some point.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) static struct unicode_data *corrections_lookup(struct unicode_data *u)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) for (i = 0; i != corrections_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) if (u->code == corrections[i].code)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) return &corrections[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) return u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) static int nfdi_equal(void *l, void *r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) struct unicode_data *left = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) struct unicode_data *right = r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) if (left->gen != right->gen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) if (left->ccc != right->ccc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) if (left->utf8nfdi && right->utf8nfdi &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) if (left->utf8nfdi || right->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) static int nfdicf_equal(void *l, void *r)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) struct unicode_data *left = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) struct unicode_data *right = r;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) if (left->gen != right->gen)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) if (left->ccc != right->ccc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) if (left->utf8nfdicf && right->utf8nfdicf &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) if (left->utf8nfdicf && right->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) if (left->utf8nfdicf || right->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407) if (left->utf8nfdi && right->utf8nfdi &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) if (left->utf8nfdi || right->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) static void nfdi_print(void *l, int indent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) leaf->code, leaf->ccc, leaf->gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) else if (leaf->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) static void nfdicf_print(void *l, int indent)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) leaf->code, leaf->ccc, leaf->gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) if (leaf->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438) printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441) else if (leaf->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445)
/*
 * Mark callback for the base nfdi tree: every leaf is marked,
 * so the argument is not inspected.  Always returns 1.
 */
static int nfdi_mark(void *l)
{
	(void)l;

	return 1;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451) static int nfdicf_mark(void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455) if (leaf->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) static int correction_mark(void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) return leaf->correction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) static int nfdi_size(void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) int size = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) if (HANGUL_SYLLABLE(leaf->code))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) size += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) else if (leaf->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) size += strlen(leaf->utf8nfdi) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) return size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479) static int nfdicf_size(void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482) int size = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) if (HANGUL_SYLLABLE(leaf->code))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) size += 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486) else if (leaf->utf8nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) size += strlen(leaf->utf8nfdicf) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488) else if (leaf->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) size += strlen(leaf->utf8nfdi) + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490) return size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) static int *nfdi_index(struct tree *tree, void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) return &tree->leafindex[leaf->code];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) static int *nfdicf_index(struct tree *tree, void *l)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) return &tree->leafindex[leaf->code];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) static unsigned char *nfdi_emit(void *l, unsigned char *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) unsigned char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) *data++ = leaf->gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) if (HANGUL_SYLLABLE(leaf->code)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) *data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) *data++ = HANGUL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) } else if (leaf->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) *data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) s = (unsigned char*)leaf->utf8nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) while ((*data++ = *s++) != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) *data++ = leaf->ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) return data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) static unsigned char *nfdicf_emit(void *l, unsigned char *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) struct unicode_data *leaf = l;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) unsigned char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) *data++ = leaf->gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) if (HANGUL_SYLLABLE(leaf->code)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) *data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) *data++ = HANGUL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) } else if (leaf->utf8nfdicf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) *data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) s = (unsigned char*)leaf->utf8nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) while ((*data++ = *s++) != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542) ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) } else if (leaf->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) *data++ = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) s = (unsigned char*)leaf->utf8nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) while ((*data++ = *s++) != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) *data++ = leaf->ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) return data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1552) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1554) static void utf8_create(struct unicode_data *data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1555) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1556) char utf[18*4+1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1557) char *u;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1558) unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1559) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1560)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1561) if (data->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1562) assert(data->utf8nfdi[0] == HANGUL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1563) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1564) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1565)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1566) u = utf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1567) um = data->utf32nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1568) if (um) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1569) for (i = 0; um[i]; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1570) u += utf8encode(u, um[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1571) *u = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1572) data->utf8nfdi = strdup(utf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1573) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1574) u = utf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1575) um = data->utf32nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1576) if (um) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1577) for (i = 0; um[i]; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1578) u += utf8encode(u, um[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1579) *u = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1580) if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1581) data->utf8nfdicf = strdup(utf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1582) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1583) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1584)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1585) static void utf8_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1586) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1587) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1588) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1590) for (unichar = 0; unichar != 0x110000; unichar++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1591) utf8_create(&unicode_data[unichar]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1592)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1593) for (i = 0; i != corrections_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1594) utf8_create(&corrections[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1597) static void trees_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1598) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1599) struct unicode_data *data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1600) unsigned int maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1601) unsigned int nextage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1602) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1603) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1604) int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1605)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1606) /* Count the number of different ages. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1607) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1608) nextage = (unsigned int)-1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1609) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1610) maxage = nextage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1611) nextage = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1612) for (i = 0; i <= corrections_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1613) data = &corrections[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1614) if (nextage < data->correction &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1615) data->correction < maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1616) nextage = data->correction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1617) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1618) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1619) } while (nextage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1620)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1621) /* Two trees per age: nfdi and nfdicf */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1622) trees_count = count * 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1623) trees = calloc(trees_count, sizeof(struct tree));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1625) /* Assign ages to the trees. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1626) count = trees_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1627) nextage = (unsigned int)-1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1628) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1629) maxage = nextage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1630) trees[--count].maxage = maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1631) trees[--count].maxage = maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1632) nextage = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1633) for (i = 0; i <= corrections_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1634) data = &corrections[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1635) if (nextage < data->correction &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1636) data->correction < maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1637) nextage = data->correction;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1638) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1639) } while (nextage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1640)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1641) /* The ages assigned above are off by one. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1642) for (i = 0; i != trees_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1643) j = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1644) while (ages[j] < trees[i].maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1645) j++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1646) trees[i].maxage = ages[j-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1647) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1648)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1649) /* Set up the forwarding between trees. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1650) trees[trees_count-2].next = &trees[trees_count-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1651) trees[trees_count-1].leaf_mark = nfdi_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1652) trees[trees_count-2].leaf_mark = nfdicf_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1653) for (i = 0; i != trees_count-2; i += 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1654) trees[i].next = &trees[trees_count-2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1655) trees[i].leaf_mark = correction_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1656) trees[i+1].next = &trees[trees_count-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1657) trees[i+1].leaf_mark = correction_mark;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1658) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1659)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1660) /* Assign the callouts. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1661) for (i = 0; i != trees_count; i += 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1662) trees[i].type = "nfdicf";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1663) trees[i].leaf_equal = nfdicf_equal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1664) trees[i].leaf_print = nfdicf_print;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1665) trees[i].leaf_size = nfdicf_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1666) trees[i].leaf_index = nfdicf_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1667) trees[i].leaf_emit = nfdicf_emit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1668)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1669) trees[i+1].type = "nfdi";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1670) trees[i+1].leaf_equal = nfdi_equal;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1671) trees[i+1].leaf_print = nfdi_print;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1672) trees[i+1].leaf_size = nfdi_size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1673) trees[i+1].leaf_index = nfdi_index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1674) trees[i+1].leaf_emit = nfdi_emit;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1675) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1676)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1677) /* Finish init. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1678) for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1679) trees[i].childnode = NODE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1680) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1681)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1682) static void trees_populate(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1683) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1684) struct unicode_data *data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1685) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1686) char keyval[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1687) int keylen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1688) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1689)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1690) for (i = 0; i != trees_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1691) if (verbose > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1692) printf("Populating %s_%x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1693) trees[i].type, trees[i].maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1694) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1695) for (unichar = 0; unichar != 0x110000; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1696) if (unicode_data[unichar].gen < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1697) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1698) keylen = utf8encode(keyval, unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1699) data = corrections_lookup(&unicode_data[unichar]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1700) if (data->correction <= trees[i].maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1701) data = &unicode_data[unichar];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1702) insert(&trees[i], keyval, keylen, data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1703) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1704) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1705) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1706)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1707) static void trees_reduce(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1708) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1709) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1710) int size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1711) int changed;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1713) for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1714) prune(&trees[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1715) for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1716) mark_nodes(&trees[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1717) do {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1718) size = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1719) for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1720) size = index_nodes(&trees[i], size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1721) changed = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1722) for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1723) changed += size_nodes(&trees[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1724) } while (changed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1725)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1726) utf8data = calloc(size, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1727) utf8data_size = size;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1728) for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1729) emit(&trees[i], utf8data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1730)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1731) if (verbose > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1732) for (i = 0; i != trees_count; i++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1733) printf("%s_%x idx %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1734) trees[i].type, trees[i].maxage, trees[i].index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1735) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1736) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1737)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1738) nfdi = utf8data + trees[trees_count-1].index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1739) nfdicf = utf8data + trees[trees_count-2].index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1740)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1741) nfdi_tree = &trees[trees_count-1];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1742) nfdicf_tree = &trees[trees_count-2];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1743) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1744)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1745) static void verify(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1746) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1747) struct unicode_data *data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1748) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1749) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1750) char key[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1751) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1752) int report;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1753) int nocf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1754)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1755) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1756) printf("Verifying %s_%x\n", tree->type, tree->maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1757) nocf = strcmp(tree->type, "nfdicf");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1758)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1759) for (unichar = 0; unichar != 0x110000; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1760) report = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1761) data = corrections_lookup(&unicode_data[unichar]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1762) if (data->correction <= tree->maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1763) data = &unicode_data[unichar];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1764) utf8encode(key,unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1765) leaf = utf8lookup(tree, hangul, key);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1766)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1767) if (!leaf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1768) if (data->gen != -1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1769) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1770) if (unichar < 0xd800 || unichar > 0xdfff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1771) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1772) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1773) if (unichar >= 0xd800 && unichar <= 0xdfff)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1774) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1775) if (data->gen == -1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1776) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1777) if (data->gen != LEAF_GEN(leaf))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1778) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1779) if (LEAF_CCC(leaf) == DECOMPOSE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1780) if (HANGUL_SYLLABLE(data->code)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1781) if (data->utf8nfdi[0] != HANGUL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1782) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1783) } else if (nocf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1784) if (!data->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1785) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1786) } else if (strcmp(data->utf8nfdi,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1787) LEAF_STR(leaf))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1788) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1789) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1790) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1791) if (!data->utf8nfdicf &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1792) !data->utf8nfdi) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1793) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1794) } else if (data->utf8nfdicf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1795) if (strcmp(data->utf8nfdicf,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1796) LEAF_STR(leaf)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1797) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1798) } else if (strcmp(data->utf8nfdi,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1799) LEAF_STR(leaf))) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1800) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1801) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1802) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1803) } else if (data->ccc != LEAF_CCC(leaf)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1804) report++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1805) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1806) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1807) if (report) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1808) printf("%X code %X gen %d ccc %d"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1809) " nfdi -> \"%s\"",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1810) unichar, data->code, data->gen,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1811) data->ccc,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1812) data->utf8nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1813) if (leaf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1814) printf(" gen %d ccc %d"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1815) " nfdi -> \"%s\"",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1816) LEAF_GEN(leaf),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1817) LEAF_CCC(leaf),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1818) LEAF_CCC(leaf) == DECOMPOSE ?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1819) LEAF_STR(leaf) : "");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1820) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1821) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1822) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1823) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1824) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1825)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1826) static void trees_verify(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1827) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1828) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1830) for (i = 0; i != trees_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1831) verify(&trees[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1832) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1833)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1834) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1835)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1836) static void help(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1837) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1838) printf("Usage: %s [options]\n", argv0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1839) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1840) printf("This program creates an a data trie used for parsing and\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1841) printf("normalization of UTF-8 strings. The trie is derived from\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1842) printf("a set of input files from the Unicode character database\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1843) printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1844) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1845) printf("The generated tree supports two normalization forms:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1846) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1847) printf("\tnfdi:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1848) printf("\t- Apply unicode normalization form NFD.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1849) printf("\t- Remove any Default_Ignorable_Code_Point.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1850) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1851) printf("\tnfdicf:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1852) printf("\t- Apply unicode normalization form NFD.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1853) printf("\t- Remove any Default_Ignorable_Code_Point.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1854) printf("\t- Apply a full casefold (C + F).\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1855) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1856) printf("These forms were chosen as being most useful when dealing\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1857) printf("with file names: NFD catches most cases where characters\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1858) printf("should be considered equivalent. The ignorables are mostly\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1859) printf("invisible, making names hard to type.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1860) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1861) printf("The options to specify the files to be used are listed\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1862) printf("below with their default values, which are the names used\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1863) printf("by version 11.0.0 of the Unicode Character Database.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1864) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1865) printf("The input files:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1866) printf("\t-a %s\n", AGE_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1867) printf("\t-c %s\n", CCC_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1868) printf("\t-p %s\n", PROP_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1869) printf("\t-d %s\n", DATA_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1870) printf("\t-f %s\n", FOLD_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1871) printf("\t-n %s\n", NORM_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1872) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1873) printf("Additionally, the generated tables are tested using:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1874) printf("\t-t %s\n", TEST_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1875) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1876) printf("Finally, the output file:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1877) printf("\t-o %s\n", UTF8_NAME);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1878) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1879) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1880)
/*
 * Print the help text via help() and terminate the program with
 * exit status 1.  Does not return.
 */
static void usage(void)
{
	help();

	exit(1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1886)
/*
 * Report a failure to open a file and terminate with exit status 1.
 *
 * name:  path of the file that could not be opened.
 * error: error number; printed both numerically and as the text
 *        returned by strerror().
 *
 * Does not return.
 */
static void open_fail(const char *name, int error)
{
	const char *reason = strerror(error);

	printf("Error %d opening %s: %s\n", error, name, reason);
	exit(1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1892)
/*
 * Report that a whole input file could not be parsed and terminate
 * with exit status 1.
 *
 * filename: name of the offending file, echoed in the message.
 *
 * Does not return.
 */
static void file_fail(const char *filename)
{
	fprintf(stdout, "Error parsing %s\n", filename);
	exit(1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1898)
/*
 * Report that a specific line of an input file could not be parsed
 * and terminate with exit status 1.
 *
 * filename: name of the file being parsed.
 * line:     text of the offending line, printed verbatim after the
 *           file name.
 *
 * Does not return.
 */
static void line_fail(const char *filename, const char *line)
{
	fprintf(stdout, "Error parsing %s:%s\n", filename, line);
	exit(1);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1904)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1905) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1906)
/*
 * Print every element of a zero-terminated array of code points to
 * stdout, each as " <HEX>" (a leading space followed by uppercase
 * hex).  The terminating zero is not printed.
 *
 * utf32str: zero-terminated array; must not be NULL.
 */
static void print_utf32(unsigned int *utf32str)
{
	unsigned int *cp = utf32str;

	while (*cp != 0) {
		printf(" %X", *cp);
		cp++;
	}
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1914)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1915) static void print_utf32nfdi(unsigned int unichar)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1916) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1917) printf(" %X ->", unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1918) print_utf32(unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1919) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1920) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1921)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1922) static void print_utf32nfdicf(unsigned int unichar)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1923) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1924) printf(" %X ->", unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1925) print_utf32(unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1926) printf("\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1927) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1928)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1929) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1930)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1931) static void age_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1932) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1933) FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1934) unsigned int first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1935) unsigned int last;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1936) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1937) unsigned int major;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1938) unsigned int minor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1939) unsigned int revision;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1940) int gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1941) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1942) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1944) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1945) printf("Parsing %s\n", age_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1946)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1947) file = fopen(age_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1948) if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1949) open_fail(age_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1950) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1951)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1952) gen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1953) while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1954) ret = sscanf(line, "# Age=V%d_%d_%d",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1955) &major, &minor, &revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1956) if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1957) ages_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1958) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1959) printf(" Age V%d_%d_%d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1960) major, minor, revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1961) if (!age_valid(major, minor, revision))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1962) line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1963) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1964) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1965) ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1966) if (ret == 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1967) ages_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1968) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1969) printf(" Age V%d_%d\n", major, minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1970) if (!age_valid(major, minor, 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1971) line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1972) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1973) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1974) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1975)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1976) /* We must have found something above. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1977) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1978) printf("%d age entries\n", ages_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1979) if (ages_count == 0 || ages_count > MAXGEN)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1980) file_fail(age_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1981)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1982) /* There is a 0 entry. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1983) ages_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1984) ages = calloc(ages_count + 1, sizeof(*ages));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1985) /* And a guard entry. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1986) ages[ages_count] = (unsigned int)-1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1987)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1988) rewind(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1989) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1990) gen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1991) while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1992) ret = sscanf(line, "# Age=V%d_%d_%d",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1993) &major, &minor, &revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1994) if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1995) ages[++gen] =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1996) UNICODE_AGE(major, minor, revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1997) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1998) printf(" Age V%d_%d_%d = gen %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1999) major, minor, revision, gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2000) if (!age_valid(major, minor, revision))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2001) line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2002) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2003) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2004) ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2005) if (ret == 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2006) ages[++gen] = UNICODE_AGE(major, minor, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2007) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2008) printf(" Age V%d_%d = %d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2009) major, minor, gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2010) if (!age_valid(major, minor, 0))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2011) line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2012) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2013) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2014) ret = sscanf(line, "%X..%X ; %d.%d #",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2015) &first, &last, &major, &minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2016) if (ret == 4) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2017) for (unichar = first; unichar <= last; unichar++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2018) unicode_data[unichar].gen = gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2019) count += 1 + last - first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2020) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2021) printf(" %X..%X gen %d\n", first, last, gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2022) if (!utf32valid(first) || !utf32valid(last))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2023) line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2024) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2026) ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2027) if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2028) unicode_data[unichar].gen = gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2029) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2030) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2031) printf(" %X gen %d\n", unichar, gen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2032) if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2033) line_fail(age_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2034) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2035) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2036) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2037) unicode_maxage = ages[gen];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2038) fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2039)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2040) /* Nix surrogate block */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2041) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2042) printf(" Removing surrogate block D800..DFFF\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2043) for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2044) unicode_data[unichar].gen = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2045)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2046) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2047) printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2048) if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2049) file_fail(age_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2050) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2051)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2052) static void ccc_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2053) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2054) FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2055) unsigned int first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2056) unsigned int last;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2057) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2058) unsigned int value;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2059) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2060) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2062) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2063) printf("Parsing %s\n", ccc_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2064)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2065) file = fopen(ccc_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2066) if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2067) open_fail(ccc_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2069) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2070) while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2071) ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2072) if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2073) for (unichar = first; unichar <= last; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2074) unicode_data[unichar].ccc = value;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2075) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2076) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2077) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2078) printf(" %X..%X ccc %d\n", first, last, value);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2079) if (!utf32valid(first) || !utf32valid(last))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2080) line_fail(ccc_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2081) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2082) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2083) ret = sscanf(line, "%X ; %d #", &unichar, &value);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2084) if (ret == 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2085) unicode_data[unichar].ccc = value;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2086) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2087) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2088) printf(" %X ccc %d\n", unichar, value);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2089) if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2090) line_fail(ccc_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2091) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2092) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2093) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2094) fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2095)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2096) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2097) printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2098) if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2099) file_fail(ccc_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2100) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2101)
/*
 * Tell whether a decomposition tag names a form that is to be
 * ignored.
 *
 * type: NUL-terminated tag name without the surrounding '<' and '>'.
 *       The comparison is exact and case-sensitive.
 *
 * Returns 1 if type is one of the listed tags, 0 otherwise.
 */
static int ignore_compatibility_form(const char *type)
{
	/* Built once; the table is read-only. */
	static const char *const ignored_types[] = {
		"font", "noBreak", "initial", "medial",
		"final", "isolated", "circle", "super",
		"sub", "vertical", "wide", "narrow",
		"small", "square", "fraction", "compat"
	};
	size_t i;

	for (i = 0; i < sizeof(ignored_types) / sizeof(ignored_types[0]); i++)
		if (strcmp(type, ignored_types[i]) == 0)
			return 1;
	return 0;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2116) static void nfdi_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2117) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2118) FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2119) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2120) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2121) char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2122) char *type;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2123) unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2124) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2125) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2126) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2128) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2129) printf("Parsing %s\n", data_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2130) file = fopen(data_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2131) if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2132) open_fail(data_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2133)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2134) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2135) while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2136) ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2137) &unichar, buf0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2138) if (ret != 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2139) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2140) if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2141) line_fail(data_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2142)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2143) s = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2144) /* skip over <tag> */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2145) if (*s == '<') {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2146) type = ++s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2147) while (*++s != '>');
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2148) *s++ = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2149) if(ignore_compatibility_form(type))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2150) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2151) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2152) /* decode the decomposition into UTF-32 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2153) i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2154) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2155) mapping[i] = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2156) if (!utf32valid(mapping[i]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2157) line_fail(data_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2158) i++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2159) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2160) mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2162) um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2163) memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2164) unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2166) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2167) print_utf32nfdi(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2168) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2169) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2170) fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2171) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2172) printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2173) if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2174) file_fail(data_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2175) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2176)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2177) static void nfdicf_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2178) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2179) FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2180) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2181) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2182) char status;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2183) char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2184) unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2185) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2186) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2187) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2189) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2190) printf("Parsing %s\n", fold_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2191) file = fopen(fold_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2192) if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2193) open_fail(fold_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2195) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2196) while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2197) ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2198) if (ret != 3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2199) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2200) if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2201) line_fail(fold_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2202) /* Use the C+F casefold. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2203) if (status != 'C' && status != 'F')
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2204) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2205) s = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2206) if (*s == '<')
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2207) while (*s++ != ' ')
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2208) ;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2209) i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2210) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2211) mapping[i] = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2212) if (!utf32valid(mapping[i]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2213) line_fail(fold_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2214) i++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2216) mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2218) um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2219) memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2220) unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2222) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2223) print_utf32nfdicf(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2224) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2225) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2226) fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2227) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2228) printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2229) if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2230) file_fail(fold_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2231) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2233) static void ignore_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2234) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2235) FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2236) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2237) unsigned int first;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2238) unsigned int last;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2239) unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2240) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2241) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2242)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2243) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2244) printf("Parsing %s\n", prop_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2245) file = fopen(prop_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2246) if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2247) open_fail(prop_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2248) assert(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2249) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2250) while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2251) ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2252) if (ret == 3) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2253) if (strcmp(buf0, "Default_Ignorable_Code_Point"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2254) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2255) if (!utf32valid(first) || !utf32valid(last))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2256) line_fail(prop_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2257) for (unichar = first; unichar <= last; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2258) free(unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2259) um = malloc(sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2260) *um = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2261) unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2262) free(unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2263) um = malloc(sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2264) *um = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2265) unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2266) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2267) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2268) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2269) printf(" %X..%X Default_Ignorable_Code_Point\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2270) first, last);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2271) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2272) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2273) ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2274) if (ret == 2) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2275) if (strcmp(buf0, "Default_Ignorable_Code_Point"))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2276) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2277) if (!utf32valid(unichar))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2278) line_fail(prop_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2279) free(unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2280) um = malloc(sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2281) *um = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2282) unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2283) free(unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2284) um = malloc(sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2285) *um = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2286) unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2287) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2288) printf(" %X Default_Ignorable_Code_Point\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2289) unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2290) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2291) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2292) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2293) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2294) fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2295)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2296) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2297) printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2298) if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2299) file_fail(prop_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2300) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2302) static void corrections_init(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2303) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2304) FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2305) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2306) unsigned int major;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2307) unsigned int minor;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2308) unsigned int revision;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2309) unsigned int age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2310) unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2311) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2312) char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2313) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2314) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2315) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2317) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2318) printf("Parsing %s\n", norm_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2319) file = fopen(norm_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2320) if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2321) open_fail(norm_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2322)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2323) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2324) while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2325) ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2326) &unichar, buf0, buf1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2327) &major, &minor, &revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2328) if (ret != 6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2329) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2330) if (!utf32valid(unichar) || !age_valid(major, minor, revision))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2331) line_fail(norm_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2332) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2333) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2334) corrections = calloc(count, sizeof(struct unicode_data));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2335) corrections_count = count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2336) rewind(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2337)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2338) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2339) while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2340) ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2341) &unichar, buf0, buf1,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2342) &major, &minor, &revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2343) if (ret != 6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2344) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2345) if (!utf32valid(unichar) || !age_valid(major, minor, revision))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2346) line_fail(norm_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2347) corrections[count] = unicode_data[unichar];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2348) assert(corrections[count].code == unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2349) age = UNICODE_AGE(major, minor, revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2350) corrections[count].correction = age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2352) i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2353) s = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2354) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2355) mapping[i] = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2356) if (!utf32valid(mapping[i]))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2357) line_fail(norm_name, line);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2358) i++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2359) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2360) mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2361)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2362) um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2363) memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2364) corrections[count].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2365)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2366) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2367) printf(" %X -> %s -> %s V%d_%d_%d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2368) unichar, buf0, buf1, major, minor, revision);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2369) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2370) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2371) fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2372)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2373) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2374) printf("Found %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2375) if (count == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2376) file_fail(norm_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2377) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2378)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2379) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2380)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2381) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2382) * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2383) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2384) * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2385) * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2386) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2387) * SBase = 0xAC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2388) * LBase = 0x1100
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2389) * VBase = 0x1161
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2390) * TBase = 0x11A7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2391) * LCount = 19
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2392) * VCount = 21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2393) * TCount = 28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2394) * NCount = 588 (VCount * TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2395) * SCount = 11172 (LCount * NCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2396) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2397) * Decomposition:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2398) * SIndex = s - SBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2399) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2400) * LV (Canonical/Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2401) * LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2402) * VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2403) * LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2404) * VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2405) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2406) * LVT (Canonical)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2407) * LVIndex = (SIndex / TCount) * TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2408) * TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2409) * LVPart = SBase + LVIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2410) * TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2411) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2412) * LVT (Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2413) * LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2414) * VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2415) * TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2416) * LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2417) * VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2418) * if (TIndex == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2419) * d = <LPart, VPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2420) * } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2421) * TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2422) * d = <LPart, VPart, TPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2423) * }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2424) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2425) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2426)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2427) static void hangul_decompose(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2428) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2429) unsigned int sb = 0xAC00;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2430) unsigned int lb = 0x1100;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2431) unsigned int vb = 0x1161;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2432) unsigned int tb = 0x11a7;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2433) /* unsigned int lc = 19; */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2434) unsigned int vc = 21;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2435) unsigned int tc = 28;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2436) unsigned int nc = (vc * tc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2437) /* unsigned int sc = (lc * nc); */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2438) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2439) unsigned int mapping[4];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2440) unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2441) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2442) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2443)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2444) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2445) printf("Decomposing hangul\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2446) /* Hangul */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2447) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2448) for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2449) unsigned int si = unichar - sb;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2450) unsigned int li = si / nc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2451) unsigned int vi = (si % nc) / tc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2452) unsigned int ti = si % tc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2453)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2454) i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2455) mapping[i++] = lb + li;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2456) mapping[i++] = vb + vi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2457) if (ti)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2458) mapping[i++] = tb + ti;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2459) mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2460)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2461) assert(!unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2462) um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2463) memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2464) unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2465)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2466) assert(!unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2467) um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2468) memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2469) unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2470)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2471) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2472) * Add a cookie as a reminder that the hangul syllable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2473) * decompositions must not be stored in the generated
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2474) * trie.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2475) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2476) unicode_data[unichar].utf8nfdi = malloc(2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2477) unicode_data[unichar].utf8nfdi[0] = HANGUL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2478) unicode_data[unichar].utf8nfdi[1] = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2479)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2480) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2481) print_utf32nfdi(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2483) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2484) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2485) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2486) printf("Created %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2487) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2489) static void nfdi_decompose(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2490) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2491) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2492) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2493) unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2494) unsigned int *dc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2495) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2496) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2497) int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2498) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2499)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2500) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2501) printf("Decomposing nfdi\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2502)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2503) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2504) for (unichar = 0; unichar != 0x110000; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2505) if (!unicode_data[unichar].utf32nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2506) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2507) for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2508) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2509) i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2510) um = unicode_data[unichar].utf32nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2511) while (*um) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2512) dc = unicode_data[*um].utf32nfdi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2513) if (dc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2514) for (j = 0; dc[j]; j++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2515) mapping[i++] = dc[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2516) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2517) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2518) mapping[i++] = *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2519) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2520) um++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2521) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2522) mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2523) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2524) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2525) free(unicode_data[unichar].utf32nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2526) um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2527) memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2528) unicode_data[unichar].utf32nfdi = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2529) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2530) /* Add this decomposition to nfdicf if there is no entry. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2531) if (!unicode_data[unichar].utf32nfdicf) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2532) um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2533) memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2534) unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2535) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2536) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2537) print_utf32nfdi(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2538) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2539) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2540) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2541) printf("Processed %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2542) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2544) static void nfdicf_decompose(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2545) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2546) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2547) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2548) unsigned int *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2549) unsigned int *dc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2550) int count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2551) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2552) int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2553) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2554)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2555) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2556) printf("Decomposing nfdicf\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2557) count = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2558) for (unichar = 0; unichar != 0x110000; unichar++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2559) if (!unicode_data[unichar].utf32nfdicf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2560) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2561) for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2562) ret = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2563) i = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2564) um = unicode_data[unichar].utf32nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2565) while (*um) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2566) dc = unicode_data[*um].utf32nfdicf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2567) if (dc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2568) for (j = 0; dc[j]; j++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2569) mapping[i++] = dc[j];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2570) ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2571) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2572) mapping[i++] = *um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2573) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2574) um++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2575) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2576) mapping[i++] = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2577) if (ret)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2578) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2579) free(unicode_data[unichar].utf32nfdicf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2580) um = malloc(i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2581) memcpy(um, mapping, i * sizeof(unsigned int));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2582) unicode_data[unichar].utf32nfdicf = um;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2583) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2584) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2585) print_utf32nfdicf(unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2586) count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2587) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2588) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2589) printf("Processed %d entries\n", count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2590) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2591)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2592) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2593)
/* Prototypes for the non-static UTF-8 helper functions. */

/* Age queries over a string. */
int utf8agemax(struct tree *tree, const char *s);
int utf8nagemax(struct tree *tree, const char *s, size_t len);
int utf8agemin(struct tree *tree, const char *s);
int utf8nagemin(struct tree *tree, const char *s, size_t len);

/* Length queries over a string. */
ssize_t utf8len(struct tree *tree, const char *s);
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len);

/* Cursor interface; struct utf8cursor is only forward-declared here. */
struct utf8cursor;
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s);
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
		size_t len);
int utf8byte(struct utf8cursor *u8c);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2604)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2605) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2606) * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2607) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2608) * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2609) * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2610) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2611) * SBase = 0xAC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2612) * LBase = 0x1100
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2613) * VBase = 0x1161
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2614) * TBase = 0x11A7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2615) * LCount = 19
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2616) * VCount = 21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2617) * TCount = 28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2618) * NCount = 588 (VCount * TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2619) * SCount = 11172 (LCount * NCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2620) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2621) * Decomposition:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2622) * SIndex = s - SBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2623) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2624) * LV (Canonical/Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2625) * LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2626) * VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2627) * LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2628) * VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2629) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2630) * LVT (Canonical)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2631) * LVIndex = (SIndex / TCount) * TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2632) * TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2633) * LVPart = SBase + LVIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2634) * TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2635) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2636) * LVT (Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2637) * LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2638) * VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2639) * TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2640) * LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2641) * VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2642) * if (TIndex == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2643) * d = <LPart, VPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2644) * } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2645) * TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2646) * d = <LPart, VPart, TPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2647) * }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2648) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2649)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2650) /* Constants */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2651) #define SB (0xAC00)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2652) #define LB (0x1100)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2653) #define VB (0x1161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2654) #define TB (0x11A7)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2655) #define LC (19)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2656) #define VC (21)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2657) #define TC (28)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2658) #define NC (VC * TC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2659) #define SC (LC * NC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2660)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2661) /* Algorithmic decomposition of hangul syllable. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2662) static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2663) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2664) unsigned int si;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2665) unsigned int li;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2666) unsigned int vi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2667) unsigned int ti;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2668) unsigned char *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2669)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2670) /* Calculate the SI, LI, VI, and TI values. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2671) si = utf8decode(str) - SB;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2672) li = si / NC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2673) vi = (si % NC) / TC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2674) ti = si % TC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2675)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2676) /* Fill in base of leaf. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2677) h = hangul;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2678) LEAF_GEN(h) = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2679) LEAF_CCC(h) = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2680) h += 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2681)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2682) /* Add LPart, a 3-byte UTF-8 sequence. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2683) h += utf8encode((char *)h, li + LB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2684)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2685) /* Add VPart, a 3-byte UTF-8 sequence. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2686) h += utf8encode((char *)h, vi + VB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2687)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2688) /* Add TPart if required, also a 3-byte UTF-8 sequence. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2689) if (ti)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2690) h += utf8encode((char *)h, ti + TB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2691)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2692) /* Terminate string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2693) h[0] = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2694)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2695) return hangul;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2696) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2697)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2698) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2699) * Use trie to scan s, touching at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2700) * Returns the leaf if one exists, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2701) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2702) * A non-NULL return guarantees that the UTF-8 sequence starting at s
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2703) * is well-formed and corresponds to a known unicode code point. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2704) * shorthand for this will be "is valid UTF-8 unicode".
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2705) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2706) static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2707) const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2708) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2709) utf8trie_t *trie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2710) int offlen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2711) int offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2712) int mask;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2713) int node;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2714)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2715) if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2716) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2717) if (len == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2718) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2719) node = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2720) trie = utf8data + tree->index;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2721) while (node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2722) offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2723) if (*trie & NEXTBYTE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2724) if (--len == 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2725) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2726) s++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2727) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2728) mask = 1 << (*trie & BITNUM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2729) if (*s & mask) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2730) /* Right leg */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2731) if (offlen) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2732) /* Right node at offset of trie */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2733) node = (*trie & RIGHTNODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2734) offset = trie[offlen];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2735) while (--offlen) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2736) offset <<= 8;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2737) offset |= trie[offlen];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2738) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2739) trie += offset;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2740) } else if (*trie & RIGHTPATH) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2741) /* Right node after this node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2742) node = (*trie & TRIENODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2743) trie++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2744) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2745) /* No right node. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2746) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2747) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2748) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2749) /* Left leg */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2750) if (offlen) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2751) /* Left node after this node. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2752) node = (*trie & LEFTNODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2753) trie += offlen + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2754) } else if (*trie & RIGHTPATH) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2755) /* No left node. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2756) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2757) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2758) /* Left node after this node */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2759) node = (*trie & TRIENODE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2760) trie++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2761) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2762) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2763) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2764) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2765) * Hangul decomposition is done algorithmically. These are the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2766) * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2767) * always 3 bytes long, so s has been advanced twice, and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2768) * start of the sequence is at s-2.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2769) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2770) if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2771) trie = utf8hangul(s - 2, hangul);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2772) return trie;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2773) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2774)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2775) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2776) * Use trie to scan s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2777) * Returns the leaf if one exists, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2778) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2779) * Forwards to trie_nlookup().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2780) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2781) static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2782) const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2783) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2784) return utf8nlookup(tree, hangul, s, (size_t)-1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2785) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2786)
/*
 * Length in bytes (1 to 4) of the UTF-8 sequence whose first byte is
 * *s, derived from that lead byte alone.  The caller must pass a
 * pointer to the first byte of a valid UTF-8 sequence; no validation
 * is done here.
 */
static inline int utf8clen(const char *s)
{
	unsigned char lead = *s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2797)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2798) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2799) * Maximum age of any character in s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2800) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2801) * Return 0 if only non-assigned code points are used.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2802) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2803) int utf8agemax(struct tree *tree, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2804) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2805) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2806) int age = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2807) int leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2808) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2809)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2810) if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2811) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2812)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2813) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2814) leaf = utf8lookup(tree, hangul, s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2815) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2816) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2817) leaf_age = ages[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2818) if (leaf_age <= tree->maxage && leaf_age > age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2819) age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2820) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2821) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2822) return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2823) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2825) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2826) * Minimum age of any character in s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2827) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2828) * Return 0 if non-assigned code points are used.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2829) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2830) int utf8agemin(struct tree *tree, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2831) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2832) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2833) int age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2834) int leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2835) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2836)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2837) if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2838) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2839) age = tree->maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2840) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2841) leaf = utf8lookup(tree, hangul, s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2842) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2843) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2844) leaf_age = ages[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2845) if (leaf_age <= tree->maxage && leaf_age < age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2846) age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2847) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2848) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2849) return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2850) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2851)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2852) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2853) * Maximum age of any character in s, touch at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2854) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2855) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2856) int utf8nagemax(struct tree *tree, const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2857) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2858) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2859) int age = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2860) int leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2861) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2862)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2863) if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2864) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2865)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2866) while (len && *s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2867) leaf = utf8nlookup(tree, hangul, s, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2868) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2869) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2870) leaf_age = ages[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2871) if (leaf_age <= tree->maxage && leaf_age > age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2872) age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2873) len -= utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2874) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2875) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2876) return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2877) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2878)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2879) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2880) * Maximum age of any character in s, touch at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2881) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2882) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2883) int utf8nagemin(struct tree *tree, const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2884) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2885) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2886) int leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2887) int age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2888) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2889)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2890) if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2891) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2892) age = tree->maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2893) while (len && *s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2894) leaf = utf8nlookup(tree, hangul, s, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2895) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2896) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2897) leaf_age = ages[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2898) if (leaf_age <= tree->maxage && leaf_age < age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2899) age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2900) len -= utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2901) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2902) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2903) return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2904) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2905)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2906) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2907) * Length of the normalization of s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2908) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2909) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2910) * A string of Default_Ignorable_Code_Point has length 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2911) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2912) ssize_t utf8len(struct tree *tree, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2913) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2914) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2915) size_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2916) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2917)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2918) if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2919) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2920) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2921) leaf = utf8lookup(tree, hangul, s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2922) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2923) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2924) if (ages[LEAF_GEN(leaf)] > tree->maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2925) ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2926) else if (LEAF_CCC(leaf) == DECOMPOSE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2927) ret += strlen(LEAF_STR(leaf));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2928) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2929) ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2930) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2931) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2932) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2933) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2934)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2935) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2936) * Length of the normalization of s, touch at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2937) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2938) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2939) ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2940) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2941) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2942) size_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2943) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2945) if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2946) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2947) while (len && *s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2948) leaf = utf8nlookup(tree, hangul, s, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2949) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2950) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2951) if (ages[LEAF_GEN(leaf)] > tree->maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2952) ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2953) else if (LEAF_CCC(leaf) == DECOMPOSE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2954) ret += strlen(LEAF_STR(leaf));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2955) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2956) ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2957) len -= utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2958) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2959) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2960) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2961) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2962)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2963) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2964) * Cursor structure used by the normalizer.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2965) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2966) struct utf8cursor {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2967) struct tree *tree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2968) const char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2969) const char *p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2970) const char *ss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2971) const char *sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2972) unsigned int len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2973) unsigned int slen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2974) short int ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2975) short int nccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2976) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2977) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2978) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2979)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2980) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2981) * Set up an utf8cursor for use by utf8byte().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2982) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2983) * s : string.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2984) * len : length of s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2985) * u8c : pointer to cursor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2986) * trie : utf8trie_t to use for normalization.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2987) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2988) * Returns -1 on error, 0 on success.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2989) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2990) int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2991) size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2992) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2993) if (!tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2994) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2995) if (!s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2996) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2997) u8c->tree = tree;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2998) u8c->s = s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2999) u8c->p = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3000) u8c->ss = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3001) u8c->sp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3002) u8c->len = len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3003) u8c->slen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3004) u8c->ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3005) u8c->nccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3006) u8c->unichar = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3007) /* Check we didn't clobber the maximum length. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3008) if (u8c->len != len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3009) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3010) /* The first byte of s may not be an utf8 continuation. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3011) if (len > 0 && (*s & 0xC0) == 0x80)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3012) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3013) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3014) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3015)
/*
 * Prepare a cursor so that utf8byte() can walk a NUL-terminated string.
 *
 * u8c  : cursor to initialize.
 * tree : tree to use for normalization.
 * s    : NUL-terminated string to normalize.
 *
 * Returns -1 on error, 0 on success (see utf8ncursor()).
 */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
	/* Largest length the cursor's unsigned int length field can hold. */
	const unsigned int no_limit = (unsigned int)-1;

	return utf8ncursor(u8c, tree, s, no_limit);
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3029)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3030) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3031) * Get one byte from the normalized form of the string described by u8c.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3032) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3033) * Returns the byte cast to an unsigned char on succes, and -1 on failure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3034) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3035) * The cursor keeps track of the location in the string in u8c->s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3036) * When a character is decomposed, the current location is stored in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3037) * u8c->p, and u8c->s is set to the start of the decomposition. Note
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3038) * that bytes from a decomposition do not count against u8c->len.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3039) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3040) * Characters are emitted if they match the current CCC in u8c->ccc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3041) * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3042) * and the function returns 0 in that case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3043) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3044) * Sorting by CCC is done by repeatedly scanning the string. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3045) * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3046) * the start of the scan. The first pass finds the lowest CCC to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3047) * emitted and stores it in u8c->nccc, the second pass emits the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3048) * characters with this CCC and finds the next lowest CCC. This limits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3049) * the number of passes to 1 + the number of different CCCs in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3050) * sequence being scanned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3051) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3052) * Therefore:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3053) * u8c->p != NULL -> a decomposition is being scanned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3054) * u8c->ss != NULL -> this is a repeating scan.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3055) * u8c->ccc == -1 -> this is the first scan of a repeating scan.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3056) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3057) int utf8byte(struct utf8cursor *u8c)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3058) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3059) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3060) int ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3062) for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3063) /* Check for the end of a decomposed character. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3064) if (u8c->p && *u8c->s == '\0') {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3065) u8c->s = u8c->p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3066) u8c->p = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3067) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3069) /* Check for end-of-string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3070) if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3071) /* There is no next byte. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3072) if (u8c->ccc == STOPPER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3073) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3074) /* End-of-string during a scan counts as a stopper. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3075) ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3076) goto ccc_mismatch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3077) } else if ((*u8c->s & 0xC0) == 0x80) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3078) /* This is a continuation of the current character. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3079) if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3080) u8c->len--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3081) return (unsigned char)*u8c->s++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3082) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3083)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3084) /* Look up the data for the current character. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3085) if (u8c->p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3086) leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3087) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3088) leaf = utf8nlookup(u8c->tree, u8c->hangul,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3089) u8c->s, u8c->len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3090) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3091)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3092) /* No leaf found implies that the input is a binary blob. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3093) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3094) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3095)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3096) /* Characters that are too new have CCC 0. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3097) if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3098) ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3099) } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3100) u8c->len -= utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3101) u8c->p = u8c->s + utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3102) u8c->s = LEAF_STR(leaf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3103) /* Empty decomposition implies CCC 0. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3104) if (*u8c->s == '\0') {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3105) if (u8c->ccc == STOPPER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3106) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3107) ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3108) goto ccc_mismatch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3109) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3110) leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3111) ccc = LEAF_CCC(leaf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3112) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3113) u8c->unichar = utf8decode(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3114)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3115) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3116) * If this is not a stopper, then see if it updates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3117) * the next canonical class to be emitted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3118) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3119) if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3120) u8c->nccc = ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3121)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3122) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3123) * Return the current byte if this is the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3124) * combining class.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3125) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3126) if (ccc == u8c->ccc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3127) if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3128) u8c->len--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3129) return (unsigned char)*u8c->s++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3130) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3131)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3132) /* Current combining class mismatch. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3133) ccc_mismatch:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3134) if (u8c->nccc == STOPPER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3135) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3136) * Scan forward for the first canonical class
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3137) * to be emitted. Save the position from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3138) * which to restart.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3139) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3140) assert(u8c->ccc == STOPPER);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3141) u8c->ccc = MINCCC - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3142) u8c->nccc = ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3143) u8c->sp = u8c->p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3144) u8c->ss = u8c->s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3145) u8c->slen = u8c->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3146) if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3147) u8c->len -= utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3148) u8c->s += utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3149) } else if (ccc != STOPPER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3150) /* Not a stopper, and not the ccc we're emitting. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3151) if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3152) u8c->len -= utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3153) u8c->s += utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3154) } else if (u8c->nccc != MAXCCC + 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3155) /* At a stopper, restart for next ccc. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3156) u8c->ccc = u8c->nccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3157) u8c->nccc = MAXCCC + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3158) u8c->s = u8c->ss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3159) u8c->p = u8c->sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3160) u8c->len = u8c->slen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3161) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3162) /* All done, proceed from here. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3163) u8c->ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3164) u8c->nccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3165) u8c->sp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3166) u8c->ss = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3167) u8c->slen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3168) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3169) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3170) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3171)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3172) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3173)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3174) static int normalize_line(struct tree *tree)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3175) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3176) char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3177) char *t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3178) int c;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3179) struct utf8cursor u8c;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3180)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3181) /* First test: null-terminated string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3182) s = buf2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3183) t = buf3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3184) if (utf8cursor(&u8c, tree, s))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3185) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3186) while ((c = utf8byte(&u8c)) > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3187) if (c != (unsigned char)*t++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3188) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3189) if (c < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3190) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3191) if (*t != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3192) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3193)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3194) /* Second test: length-limited string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3195) s = buf2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3196) /* Replace NUL with a value that will cause an error if seen. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3197) s[strlen(s) + 1] = -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3198) t = buf3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3199) if (utf8cursor(&u8c, tree, s))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3200) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3201) while ((c = utf8byte(&u8c)) > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3202) if (c != (unsigned char)*t++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3203) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3204) if (c < 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3205) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3206) if (*t != 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3207) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3208)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3209) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3210) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3211)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3212) static void normalization_test(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3213) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3214) FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3215) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3216) struct unicode_data *data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3217) char *s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3218) char *t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3219) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3220) int ignorables;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3221) int tests = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3222) int failures = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3223)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3224) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3225) printf("Parsing %s\n", test_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3226) /* Step one, read data from file. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3227) file = fopen(test_name, "r");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3228) if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3229) open_fail(test_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3230)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3231) while (fgets(line, LINESIZE, file)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3232) ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3233) buf0, buf1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3234) if (ret != 2 || *line == '#')
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3235) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3236) s = buf0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3237) t = buf2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3238) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3239) unichar = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3240) t += utf8encode(t, unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3241) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3242) *t = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3244) ignorables = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3245) s = buf1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3246) t = buf3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3247) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3248) unichar = strtoul(s, &s, 16);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3249) data = &unicode_data[unichar];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3250) if (data->utf8nfdi && !*data->utf8nfdi)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3251) ignorables = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3252) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3253) t += utf8encode(t, unichar);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3254) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3255) *t = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3256)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3257) tests++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3258) if (normalize_line(nfdi_tree) < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3259) printf("Line %s -> %s", buf0, buf1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3260) if (ignorables)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3261) printf(" (ignorables removed)");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3262) printf(" failure\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3263) failures++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3265) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3266) fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3267) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3268) printf("Ran %d tests with %d failures\n", tests, failures);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3269) if (failures)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3270) file_fail(test_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3271) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3273) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3274)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3275) static void write_file(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3276) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3277) FILE *file;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3278) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3279) int j;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3280) int t;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3281) int gen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3283) if (verbose > 0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3284) printf("Writing %s\n", utf8_name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3285) file = fopen(utf8_name, "w");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3286) if (!file)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3287) open_fail(utf8_name, errno);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3289) fprintf(file, "/* This file is generated code, do not edit. */\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3290) fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3291) fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3292) fprintf(file, "#endif\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3293) fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3294) fprintf(file, "static const unsigned int utf8vers = %#x;\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3295) unicode_maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3296) fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3297) fprintf(file, "static const unsigned int utf8agetab[] = {\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3298) for (i = 0; i != ages_count; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3299) fprintf(file, "\t%#x%s\n", ages[i],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3300) ages[i] == unicode_maxage ? "" : ",");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3301) fprintf(file, "};\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3302) fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3303) fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3304) t = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3305) for (gen = 0; gen < ages_count; gen++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3306) fprintf(file, "\t{ %#x, %d }%s\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3307) ages[gen], trees[t].index,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3308) ages[gen] == unicode_maxage ? "" : ",");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3309) if (trees[t].maxage == ages[gen])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3310) t += 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3311) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3312) fprintf(file, "};\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3313) fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3314) fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3315) t = 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3316) for (gen = 0; gen < ages_count; gen++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3317) fprintf(file, "\t{ %#x, %d }%s\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3318) ages[gen], trees[t].index,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3319) ages[gen] == unicode_maxage ? "" : ",");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3320) if (trees[t].maxage == ages[gen])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3321) t += 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3322) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3323) fprintf(file, "};\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3324) fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3325) fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3326) utf8data_size);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3327) t = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3328) for (i = 0; i != utf8data_size; i += 16) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3329) if (i == trees[t].index) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3330) fprintf(file, "\t/* %s_%x */\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3331) trees[t].type, trees[t].maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3332) if (t < trees_count-1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3333) t++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3334) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3335) fprintf(file, "\t");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3336) for (j = i; j != i + 16; j++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3337) fprintf(file, "0x%.2x%s", utf8data[j],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3338) (j < utf8data_size -1 ? "," : ""));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3339) fprintf(file, "\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3340) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3341) fprintf(file, "};\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3342) fclose(file);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3343) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3344)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3345) /* ------------------------------------------------------------------ */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3346)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3347) int main(int argc, char *argv[])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3348) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3349) unsigned int unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3350) int opt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3352) argv0 = argv[0];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3353)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3354) while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3355) switch (opt) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3356) case 'a':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3357) age_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3358) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3359) case 'c':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3360) ccc_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3361) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3362) case 'd':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3363) data_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3364) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3365) case 'f':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3366) fold_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3367) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3368) case 'n':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3369) norm_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3370) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3371) case 'o':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3372) utf8_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3373) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3374) case 'p':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3375) prop_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3376) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3377) case 't':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3378) test_name = optarg;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3379) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3380) case 'v':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3381) verbose++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3382) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3383) case 'h':
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3384) help();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3385) exit(0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3386) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3387) usage();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3388) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3389) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3390)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3391) if (verbose > 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3392) help();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3393) for (unichar = 0; unichar != 0x110000; unichar++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3394) unicode_data[unichar].code = unichar;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3395) age_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3396) ccc_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3397) nfdi_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3398) nfdicf_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3399) ignore_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3400) corrections_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3401) hangul_decompose();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3402) nfdi_decompose();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3403) nfdicf_decompose();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3404) utf8_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3405) trees_init();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3406) trees_populate();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3407) trees_reduce();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3408) trees_verify();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3409) /* Prevent "unused function" warning. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3410) (void)lookup(nfdi_tree, " ");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3411) if (verbose > 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3412) tree_walk(nfdi_tree);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3413) if (verbose > 2)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3414) tree_walk(nfdicf_tree);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3415) normalization_test();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3416) write_file();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3417)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3418) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3419) }