lf_hfs_sbunicode.h [plain text]

/* Copyright © 2017-2018 Apple Inc. All rights reserved.
 *
 *  lf_hfs_sbunicode.h
 *  livefiles_hfs
 *
 *  Created by Oded Shoshani on 31/1/18.
 */

#ifndef lf_hfs_sbunicode_h
#define lf_hfs_sbunicode_h

/*
 Includes Unicode 3.2 decomposition code derived from Core Foundation
 */

/*
 * UTF-8 (Unicode Transformation Format)
 *
 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
 * character as a sequence of one to four bytes. Only the shortest form
 * required to represent the significant Unicode bits is legal.
 *
 * UTF-8 Multibyte Codes
 *
 * Bytes   Bits   Unicode Min  Unicode Max   UTF-8 Byte Sequence (binary)
 * -----------------------------------------------------------------------------
 *   1       7       0x0000        0x007F    0xxxxxxx
 *   2      11       0x0080        0x07FF    110xxxxx 10xxxxxx
 *   3      16       0x0800        0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
 *   4      21      0x10000      0x10FFFF    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * -----------------------------------------------------------------------------
 */

/*
 * UTF-8 encode/decode flags
 */
#define UTF_REVERSE_ENDIAN      0x0001   /* reverse UCS-2 byte order */
#define UTF_ADD_NULL_TERM       0x0002   /* add null termination */
#define UTF_DECOMPOSED          0x0004   /* generate fully decomposed UCS-2 */
#define UTF_PRECOMPOSED         0x0008   /* generate precomposed UCS-2 */
#define UTF_ESCAPE_ILLEGAL      0x0010   /* escape illegal UTF-8 */
#define UTF_SFM_CONVERSIONS     0x0020   /* Use SFM mappings for illegal NTFS chars */

#define UTF_BIG_ENDIAN       \
((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
#define UTF_LITTLE_ENDIAN    \
((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)



/*
 * utf8_encodelen - Calculate the UTF-8 encoding length
 *
 * This function takes a Unicode input string, ucsp, of ucslen bytes
 * and calculates the size of the UTF-8 output in bytes (not including
 * a NULL termination byte). The string must reside in kernel memory.
 *
 * If '/' chars are possible in the Unicode input then an alternate
 * (replacement) char should be provided in altslash.
 *
 * FLAGS
 *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
 *
 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 *
 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 *
 *    UTF_DECOMPOSED:  generate fully decomposed output
 *
 *    UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
 *
 * ERRORS
 *    None
 */
size_t utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags);

/*
 * utf8_encodestr - Encodes a Unicode string to UTF-8
 *
 * NOTES:
 *    The resulting UTF-8 string is NULL terminated.
 *
 *    If '/' chars are allowed on disk then an alternate
 *    (replacement) char must be provided in altslash.
 *
 * input flags:
 *    UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
 *
 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 *
 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 *
 *    UTF_DECOMPOSED:  generate fully decomposed output
 *
 *    UTF_ADD_NULL_TERM:  add NULL termination to UTF-8 output
 *
 * result:
 *    ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
 *
 *    EINVAL: Illegal char found; char was replaced by an '_'.
 */
extern int utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p, size_t * utf8len, size_t buflen, u_int16_t altslash, int flags);

/*
 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
 *
 * NOTES:
 *    The input UTF-8 string does not need to be null terminated
 *    if utf8len is set.
 *
 *    If '/' chars are allowed on disk then an alternate
 *    (replacement) char must be provided in altslash.
 *
 * input flags:
 *    UTF_REV_ENDIAN:  Unicode byte order is opposite current runtime
 *
 *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
 *
 *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
 *
 *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
 *
 *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
 *
 *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
 *
 * result:
 *    ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
 *
 *    EINVAL: Illegal UTF-8 sequence found.
 */
int utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp, size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);

#endif /* lf_hfs_sbunicode_h */