^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) /* SPDX-License-Identifier: GPL-2.0-only */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * Copyright (c) 2014 SGI.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * All rights reserved.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) #ifndef UTF8NORM_H
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) #define UTF8NORM_H
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) #include <linux/types.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) #include <linux/export.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) #include <linux/string.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) #include <linux/module.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) /* Encode a unicode version number MAJ.MIN.REV as a single unsigned int: (MAJ << 16) | (MIN << 8) | REV. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #define UNICODE_MAJ_SHIFT (16) /* left shift applied to the major number */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #define UNICODE_MIN_SHIFT (8) /* left shift applied to the minor number */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #define UNICODE_AGE(MAJ, MIN, REV) \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21) ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) ((unsigned int)(REV))) /* REV is OR-ed in unshifted; no component is masked, so MIN or REV above 255 would overlap the next field */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) /* Unicode version queries: is a given version supported, and the highest version supported by the data tables. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26) extern int utf8version_latest(void);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) * Look for the correct const struct utf8data for a unicode version.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) * Returns NULL if the version requested is too new.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) * Two normalization forms are supported: nfdi and nfdicf.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) * nfdi:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) * - Apply unicode normalization form NFD.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) * - Remove any Default_Ignorable_Code_Point.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) * nfdicf:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) * - Apply unicode normalization form NFD.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) * - Remove any Default_Ignorable_Code_Point.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) * - Apply a full casefold (C + F).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) extern const struct utf8data *utf8nfdi(unsigned int maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) extern const struct utf8data *utf8nfdicf(unsigned int maxage);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47) * Determine the maximum age of any unicode character in the string.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) * Returns 0 if only unassigned code points are present.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) * Returns -1 if the input is not valid UTF-8.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) extern int utf8agemax(const struct utf8data *data, const char *s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) * Determine the minimum age of any unicode character in the string.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) * Returns 0 if any unassigned code points are present.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) * Returns -1 if the input is not valid UTF-8.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) extern int utf8agemin(const struct utf8data *data, const char *s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) * Determine the length of the normalized form of the string,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) * excluding any terminating NUL byte.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) * Returns 0 if only ignorable code points are present.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) * Returns -1 if the input is not valid UTF-8.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68) extern ssize_t utf8len(const struct utf8data *data, const char *s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) /* Number of bytes in the hangul[] array of struct utf8cursor below. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) #define UTF8HANGULLEAF (12)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) * Cursor structure used by the normalizer: set up by utf8cursor() or utf8ncursor(), then passed to utf8byte().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) struct utf8cursor {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) const struct utf8data *data; /* presumably the data argument given to utf8cursor()/utf8ncursor() - TODO confirm */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) const char *s; /* presumably a position in the string being normalized - TODO confirm */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) const char *p; /* NOTE(review): role not visible from this header; see the normalizer implementation */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81) const char *ss; /* NOTE(review): role not visible from this header; name suggests a companion of s */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) const char *sp; /* NOTE(review): role not visible from this header; name suggests a companion of p */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) unsigned int len; /* presumably derived from the len argument of utf8ncursor() - TODO confirm */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) unsigned int slen; /* NOTE(review): role not visible from this header; name suggests a companion of len */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) short int ccc; /* presumably a canonical combining class value - TODO confirm */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) short int nccc; /* presumably a second canonical combining class value - TODO confirm */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) unsigned char hangul[UTF8HANGULLEAF]; /* UTF8HANGULLEAF (12) bytes of storage; presumably scratch space for Hangul handling - TODO confirm */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) * Initialize a utf8cursor to normalize a string.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92) * Returns 0 on success.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) * Returns -1 on failure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95) extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) const char *s);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) const char *s, size_t len);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) * Get the next byte in the normalization.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) * Returns a value > 0 && < 256 on success.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) * Returns 0 when the end of the normalization is reached.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) * Returns -1 if the string being normalized is not valid UTF-8.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) extern int utf8byte(struct utf8cursor *u8c);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) #endif /* UTF8NORM_H */