^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * Copyright (c) 2014 SGI.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * All rights reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) #include "utf8n.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8)
/*
 * Describes one normalization trie inside the utf8data[] byte array.
 */
struct utf8data {
	/* Newest age accepted; leaf ages above this are ignored by utf8agemax() and friends. */
	unsigned int maxage;
	/* Position of the trie root, added to utf8data[] by utf8nlookup(). */
	unsigned int offset;
};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) #define __INCLUDED_FROM_UTF8NORM_C__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) #include "utf8data.h"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #undef __INCLUDED_FROM_UTF8NORM_C__
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) int utf8version_is_supported(u8 maj, u8 min, u8 rev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) int i = ARRAY_SIZE(utf8agetab) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) while (i >= 0 && utf8agetab[i] != 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) if (sb_utf8version == utf8agetab[i])
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) i--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) EXPORT_SYMBOL(utf8version_is_supported);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31)
/*
 * Return utf8vers.
 *
 * NOTE(review): utf8vers is not defined in this part of the file; it
 * presumably comes from utf8data.h and encodes the newest Unicode
 * version covered by the tables - confirm against that header.
 */
int utf8version_latest(void)
{
	return utf8vers;
}
EXPORT_SYMBOL(utf8version_latest);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) * UTF-8 valid ranges.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) * The UTF-8 encoding spreads the bits of a 32bit word over several
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) * bytes. This table gives the ranges that can be held and how they'd
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) * be represented.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) * 0x00000000 0x0000007F: 0xxxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) * There is an additional requirement on UTF-8, in that only the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) * shortest representation of a 32bit value is to be used. A decoder
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) * must not decode sequences that do not satisfy this requirement.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) * Thus the allowed ranges have a lower bound.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) * 0x00000000 0x0000007F: 0xxxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) * 17 planes of 65536 values. This limits the sequences actually seen
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) * even more, to just the following.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) * 0 - 0x7F: 0 - 0x7F
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) * Note that the longest sequence seen with valid usage is 4 bytes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) * the same as a single UTF-32 character. This means the UTF-8
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) * representation of Unicode is never larger than UTF-32.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) * The shortest sequence requirement was introduced by:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) * Corrigendum #1: UTF-8 Shortest Form
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) * It can be found here:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) * http://www.unicode.org/versions/corrigendum1.html
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85)
/*
 * Length in bytes of the UTF-8 sequence that starts at s.
 *
 * Only the lead byte is inspected, so s must point at the first byte
 * of a valid UTF-8 sequence. Any byte below 0xC0 yields 1.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = (unsigned char)*s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97)
/*
 * Decode the 3-byte UTF-8 sequence at str into its code point.
 *
 * The low 4 bits of the lead byte and the low 6 bits of each of the
 * two following bytes are concatenated. No validation is performed.
 */
static unsigned int
utf8decode3(const char *str)
{
	const unsigned int hi = str[0] & 0x0F;
	const unsigned int mid = str[1] & 0x3F;
	const unsigned int lo = str[2] & 0x3F;

	return (hi << 12) | (mid << 6) | lo;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114)
/*
 * Encode val as a 3-byte UTF-8 sequence in str[0..2].
 *
 * The caller must make sure val needs exactly three bytes; the value
 * is not range checked. Always returns 3, the number of bytes written.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	str[0] = (val >> 12) | 0xE0;
	str[1] = ((val >> 6) & 0x3F) | 0x80;
	str[2] = (val & 0x3F) | 0x80;

	return 3;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129)
/*
 * utf8trie_t
 *
 * A compact binary tree, used to decode UTF-8 characters. Each node
 * tests a single bit of the input.
 *
 * An internal node consists of one header byte, followed by up to
 * three bytes holding an offset into the tree. The header byte is laid
 * out as follows:
 *
 *   bits 0-2  BITNUM    - the bit number to be tested
 *   bit  3    NEXTBYTE  - advance to the next input byte if set
 *   bits 4-5  OFFLEN    - number of bytes in the offset
 *   bits 6-7  meaning depends on OFFLEN:
 *
 *     OFFLEN == 0 (non-branching node)
 *       RIGHTPATH - set if the following node is for the right-hand
 *                   path (tested bit is set)
 *       TRIENODE  - set if the following node is an internal node,
 *                   otherwise it is a leaf node
 *
 *     OFFLEN != 0 (branching node)
 *       RIGHTNODE - set if the right-hand node is internal
 *       LEFTNODE  - set if the left-hand node is internal
 *
 * Due to the way utf8 works, there cannot be branching nodes with
 * NEXTBYTE set, and moreover those nodes always have a righthand
 * descendant.
 */
typedef const unsigned char utf8trie_t;

/* Header fields present in every internal node. */
#define BITNUM		0x07
#define NEXTBYTE	0x08
#define OFFLEN_SHIFT	4
#define OFFLEN		0x30

/* Bits 6 and 7 of a non-branching node (OFFLEN == 0). */
#define RIGHTPATH	0x40
#define TRIENODE	0x80

/* Bits 6 and 7 of a branching node (OFFLEN != 0). */
#define RIGHTNODE	0x40
#define LEFTNODE	0x80
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163)
/*
 * utf8leaf_t
 *
 * Leaves are embedded in the trie itself and therefore share its
 * underlying datatype: unsigned char.
 *
 * leaf[0]: Generation. The unicode version in which the code point was
 *          defined, stored as an index into utf8agetab[]. This lets us
 *          filter code points by unicode version. The CCC of a
 *          non-defined code point is 0.
 * leaf[1]: Canonical Combining Class. During normalization, all
 *          characters with a non-zero CCC that occur between two
 *          characters with a CCC of 0, or at the begin or end of a
 *          string, must be stably sorted into ascending CCC order.
 *          The unicode standard guarantees that all CCC values are
 *          between 0 and 254 inclusive, which leaves 255 available as
 *          a special value.
 *          Code points with CCC 0 are known as stoppers.
 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the start
 *          of a NUL-terminated string that is the decomposition of the
 *          character.
 *          The CCC of a decomposable character is the same as the CCC
 *          of the first character of its decomposition.
 *          Some characters decompose as the empty string: these are
 *          characters with the Default_Ignorable_Code_Point property.
 *          These do affect normalization, as they all have CCC 0.
 *
 * The decompositions in the trie have been fully expanded, with the
 * exception of Hangul syllables, which are decomposed algorithmically.
 *
 * Casefolding, if applicable, is also done using decompositions.
 *
 * The trie is constructed in such a way that leaves exist for all
 * UTF-8 sequences that match the criteria from the "UTF-8 valid
 * ranges" comment above, and only for those sequences. Therefore a
 * lookup in the trie can be used to validate the UTF-8 input.
 */
typedef const unsigned char utf8leaf_t;

/* Accessors for the three leaf fields; GEN and CCC expand to lvalues. */
#define LEAF_GEN(LEAF)	(*(LEAF))
#define LEAF_CCC(LEAF)	(*((LEAF) + 1))
#define LEAF_STR(LEAF)	((const char *)&(LEAF)[2])

/* Range of real CCC values, and the two values with special meaning. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)
#define DECOMPOSE	(255)

/* First byte of LEAF_STR() that marks a Hangul syllable leaf. */
#define HANGUL		((char)(255))
/*
 * Size of the synthesized leaf used for Hangul syllable decomposition:
 * 2 header bytes, up to three 3-byte UTF-8 sequences, and a NUL.
 */
#define UTF8HANGULLEAF	(12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221) * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) * SBase = 0xAC00
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) * LBase = 0x1100
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) * VBase = 0x1161
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) * TBase = 0x11A7
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227) * LCount = 19
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) * VCount = 21
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) * TCount = 28
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) * NCount = 588 (VCount * TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) * SCount = 11172 (LCount * NCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) * Decomposition:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) * SIndex = s - SBase
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) * LV (Canonical/Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237) * LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) * VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) * LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240) * VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) * LVT (Canonical)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) * LVIndex = (SIndex / TCount) * TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) * TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245) * LVPart = SBase + LVIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) * TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) * LVT (Full)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) * LIndex = SIndex / NCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250) * VIndex = (SIndex % NCount) / TCount
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 251) * TIndex = (SIndex % TCount)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 252) * LPart = LBase + LIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 253) * VPart = VBase + VIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) * if (TIndex == 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) * d = <LPart, VPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) * } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) * TPart = TBase + TIndex
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) * d = <LPart, VPart, TPart>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) * }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261)
/*
 * Constants for the Hangul decomposition algorithm. The short names
 * abbreviate the names used in the algorithm description above:
 * SB = SBase, LB = LBase, VB = VBase, TB = TBase,
 * LC = LCount, VC = VCount, TC = TCount, NC = NCount, SC = SCount.
 */
#define SB	(0xAC00)
#define LB	(0x1100)
#define VB	(0x1161)
#define TB	(0x11A7)
#define LC	(19)
#define VC	(21)
#define TC	(28)
#define NC	(VC * TC)	/* 588 */
#define SC	(LC * NC)	/* 11172 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273) /* Algorithmic decomposition of hangul syllable. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) static utf8leaf_t *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) utf8hangul(const char *str, unsigned char *hangul)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) unsigned int si;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) unsigned int li;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) unsigned int vi;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) unsigned int ti;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) unsigned char *h;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) /* Calculate the SI, LI, VI, and TI values. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) si = utf8decode3(str) - SB;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) li = si / NC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) vi = (si % NC) / TC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) ti = si % TC;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) /* Fill in base of leaf. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) h = hangul;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) LEAF_GEN(h) = 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) LEAF_CCC(h) = DECOMPOSE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) h += 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) /* Add LPart, a 3-byte UTF-8 sequence. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) h += utf8encode3((char *)h, li + LB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) /* Add VPart, a 3-byte UTF-8 sequence. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) h += utf8encode3((char *)h, vi + VB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) /* Add TPart if required, also a 3-byte UTF-8 sequence. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) if (ti)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) h += utf8encode3((char *)h, ti + TB);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) /* Terminate string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306) h[0] = '\0';
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) return hangul;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310)
/*
 * Use trie to scan s, touching at most len bytes.
 * Returns the leaf if one exists, NULL otherwise.
 *
 * data selects the trie: its root is at utf8data + data->offset.
 * hangul is scratch space that receives a synthesized leaf when the
 * sequence is a Hangul syllable; in that case the returned pointer is
 * hangul itself rather than a pointer into the trie. utf8hangul()
 * writes at most UTF8HANGULLEAF bytes into it.
 *
 * NULL is also returned when data is NULL, when len is 0, or when the
 * sequence would need more than len bytes.
 *
 * A non-NULL return guarantees that the UTF-8 sequence starting at s
 * is well-formed and corresponds to a known unicode code point. The
 * shorthand for this will be "is valid UTF-8 unicode".
 */
static utf8leaf_t *utf8nlookup(const struct utf8data *data,
			       unsigned char *hangul, const char *s, size_t len)
{
	utf8trie_t	*trie = NULL;
	int		offlen;
	int		offset;
	int		mask;
	int		node;	/* non-zero while trie points at an internal node */

	if (!data)
		return NULL;
	if (len == 0)
		return NULL;

	trie = utf8data + data->offset;
	node = 1;
	while (node) {
		/* Number of offset bytes following the node header. */
		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
		if (*trie & NEXTBYTE) {
			/* This node tests the next input byte; fail if none is left. */
			if (--len == 0)
				return NULL;
			s++;
		}
		/* Select the bit of the current input byte that this node tests. */
		mask = 1 << (*trie & BITNUM);
		if (*s & mask) {
			/* Right leg */
			if (offlen) {
				/*
				 * Right node at offset of trie. The offset is
				 * stored in trie[1..offlen], least significant
				 * byte first, and is relative to this node.
				 */
				node = (*trie & RIGHTNODE);
				offset = trie[offlen];
				while (--offlen) {
					offset <<= 8;
					offset |= trie[offlen];
				}
				trie += offset;
			} else if (*trie & RIGHTPATH) {
				/* Right node after this node */
				node = (*trie & TRIENODE);
				trie++;
			} else {
				/* No right node. */
				return NULL;
			}
		} else {
			/* Left leg */
			if (offlen) {
				/* Left node after this node: skip header and offset bytes. */
				node = (*trie & LEFTNODE);
				trie += offlen + 1;
			} else if (*trie & RIGHTPATH) {
				/* No left node. */
				return NULL;
			} else {
				/* Left node after this node */
				node = (*trie & TRIENODE);
				trie++;
			}
		}
	}
	/*
	 * Hangul decomposition is done algorithmically. These are the
	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
	 * always 3 bytes long, so s has been advanced twice, and the
	 * start of the sequence is at s-2.
	 */
	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
		trie = utf8hangul(s - 2, hangul);
	return trie;
}
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) * Use trie to scan s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) * Returns the leaf if one exists, NULL otherwise.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) * Forwards to utf8nlookup().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) static utf8leaf_t *utf8lookup(const struct utf8data *data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396) unsigned char *hangul, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) return utf8nlookup(data, hangul, s, (size_t)-1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) * Maximum age of any character in s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) * Return 0 if only non-assigned code points are used.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406) int utf8agemax(const struct utf8data *data, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) int age = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) int leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) if (!data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) leaf = utf8lookup(data, hangul, s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) leaf_age = utf8agetab[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422) if (leaf_age <= data->maxage && leaf_age > age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) EXPORT_SYMBOL(utf8agemax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) * Minimum age of any character in s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433) * Return 0 if non-assigned code points are used.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) int utf8agemin(const struct utf8data *data, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) int age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) int leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) if (!data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) age = data->maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) leaf = utf8lookup(data, hangul, s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) leaf_age = utf8agetab[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450) if (leaf_age <= data->maxage && leaf_age < age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) EXPORT_SYMBOL(utf8agemin);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459) * Maximum age of any character in s, touch at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) int age = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) int leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469) if (!data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) while (len && *s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) leaf = utf8nlookup(data, hangul, s, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) leaf_age = utf8agetab[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) if (leaf_age <= data->maxage && leaf_age > age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478) age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) len -= utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484) EXPORT_SYMBOL(utf8nagemax);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) * Maximum age of any character in s, touch at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490) int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) int leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) int age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) if (!data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) age = data->maxage;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) while (len && *s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501) leaf = utf8nlookup(data, hangul, s, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) leaf_age = utf8agetab[LEAF_GEN(leaf)];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) if (leaf_age <= data->maxage && leaf_age < age)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) age = leaf_age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) len -= utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) return age;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) EXPORT_SYMBOL(utf8nagemin);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) * Length of the normalization of s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) * A string of Default_Ignorable_Code_Point has length 0.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) ssize_t utf8len(const struct utf8data *data, const char *s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) size_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) if (!data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) while (*s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) leaf = utf8lookup(data, hangul, s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) else if (LEAF_CCC(leaf) == DECOMPOSE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535) ret += strlen(LEAF_STR(leaf));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) EXPORT_SYMBOL(utf8len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) * Length of the normalization of s, touch at most len bytes.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) * Return -1 if s is not valid UTF-8 unicode.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) size_t ret = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) unsigned char hangul[UTF8HANGULLEAF];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) if (!data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) while (len && *s) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) leaf = utf8nlookup(data, hangul, s, len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562) else if (LEAF_CCC(leaf) == DECOMPOSE)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) ret += strlen(LEAF_STR(leaf));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) ret += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) len -= utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) s += utf8clen(s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) EXPORT_SYMBOL(utf8nlen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) * Set up an utf8cursor for use by utf8byte().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) * u8c : pointer to cursor.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) * data : const struct utf8data to use for normalization.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) * s : string.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579) * len : length of s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) * Returns -1 on error, 0 on success.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583) int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) const char *s, size_t len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586) if (!data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) if (!s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) u8c->data = data;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) u8c->s = s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) u8c->p = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) u8c->ss = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) u8c->sp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) u8c->len = len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596) u8c->slen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) u8c->ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) u8c->nccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) /* Check we didn't clobber the maximum length. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) if (u8c->len != len)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) /* The first byte of s may not be an utf8 continuation. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603) if (len > 0 && (*s & 0xC0) == 0x80)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) EXPORT_SYMBOL(utf8ncursor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608)
/*
 * Initialize an utf8cursor so that utf8byte() can walk a
 * NUL-terminated string.
 *
 * u8c  : cursor to initialize.
 * data : const struct utf8data to use for normalization.
 * s    : NUL-terminated string.
 *
 * Thin wrapper around utf8ncursor() that passes the largest unsigned
 * int value as the length.
 *
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s)
{
	const size_t unbounded = (unsigned int)-1;

	return utf8ncursor(u8c, data, s, unbounded);
}
EXPORT_SYMBOL(utf8cursor);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) * Get one byte from the normalized form of the string described by u8c.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) * Returns the byte cast to an unsigned char on succes, and -1 on failure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) * The cursor keeps track of the location in the string in u8c->s.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) * When a character is decomposed, the current location is stored in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) * u8c->p, and u8c->s is set to the start of the decomposition. Note
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633) * that bytes from a decomposition do not count against u8c->len.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) * Characters are emitted if they match the current CCC in u8c->ccc.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636) * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) * and the function returns 0 in that case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) * Sorting by CCC is done by repeatedly scanning the string. The
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640) * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) * the start of the scan. The first pass finds the lowest CCC to be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) * emitted and stores it in u8c->nccc, the second pass emits the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) * characters with this CCC and finds the next lowest CCC. This limits
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) * the number of passes to 1 + the number of different CCCs in the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) * sequence being scanned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) * Therefore:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) * u8c->p != NULL -> a decomposition is being scanned.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) * u8c->ss != NULL -> this is a repeating scan.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) * u8c->ccc == -1 -> this is the first scan of a repeating scan.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) int utf8byte(struct utf8cursor *u8c)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) utf8leaf_t *leaf;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655) int ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) for (;;) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) /* Check for the end of a decomposed character. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) if (u8c->p && *u8c->s == '\0') {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) u8c->s = u8c->p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) u8c->p = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) /* Check for end-of-string. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) /* There is no next byte. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) if (u8c->ccc == STOPPER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) /* End-of-string during a scan counts as a stopper. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670) ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) goto ccc_mismatch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) } else if ((*u8c->s & 0xC0) == 0x80) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) /* This is a continuation of the current character. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674) if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) u8c->len--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) return (unsigned char)*u8c->s++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) /* Look up the data for the current character. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) if (u8c->p) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681) leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) leaf = utf8nlookup(u8c->data, u8c->hangul,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) u8c->s, u8c->len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) /* No leaf found implies that the input is a binary blob. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) ccc = LEAF_CCC(leaf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692) /* Characters that are too new have CCC 0. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) } else if (ccc == DECOMPOSE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) u8c->len -= utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) u8c->p = u8c->s + utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) u8c->s = LEAF_STR(leaf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) /* Empty decomposition implies CCC 0. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) if (*u8c->s == '\0') {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) if (u8c->ccc == STOPPER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) goto ccc_mismatch;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) if (!leaf)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709) return -1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) ccc = LEAF_CCC(leaf);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) * If this is not a stopper, then see if it updates
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) * the next canonical class to be emitted.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 717) if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 718) u8c->nccc = ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 719)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 720) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721) * Return the current byte if this is the current
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722) * combining class.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) if (ccc == u8c->ccc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) u8c->len--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) return (unsigned char)*u8c->s++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) /* Current combining class mismatch. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) ccc_mismatch:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) if (u8c->nccc == STOPPER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) * Scan forward for the first canonical class
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) * to be emitted. Save the position from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736) * which to restart.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) u8c->ccc = MINCCC - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739) u8c->nccc = ccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) u8c->sp = u8c->p;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) u8c->ss = u8c->s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742) u8c->slen = u8c->len;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) u8c->len -= utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) u8c->s += utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) } else if (ccc != STOPPER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) /* Not a stopper, and not the ccc we're emitting. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748) if (!u8c->p)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) u8c->len -= utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) u8c->s += utf8clen(u8c->s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) } else if (u8c->nccc != MAXCCC + 1) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) /* At a stopper, restart for next ccc. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) u8c->ccc = u8c->nccc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) u8c->nccc = MAXCCC + 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) u8c->s = u8c->ss;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) u8c->p = u8c->sp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) u8c->len = u8c->slen;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) /* All done, proceed from here. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) u8c->ccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) u8c->nccc = STOPPER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) u8c->sp = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) u8c->ss = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) u8c->slen = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) EXPORT_SYMBOL(utf8byte);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770) const struct utf8data *utf8nfdi(unsigned int maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) int i = ARRAY_SIZE(utf8nfdidata) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) while (maxage < utf8nfdidata[i].maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) i--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) if (maxage > utf8nfdidata[i].maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) return &utf8nfdidata[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) EXPORT_SYMBOL(utf8nfdi);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) const struct utf8data *utf8nfdicf(unsigned int maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) while (maxage < utf8nfdicfdata[i].maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) i--;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) if (maxage > utf8nfdicfdata[i].maxage)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) return &utf8nfdicfdata[i];
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792) EXPORT_SYMBOL(utf8nfdicf);