#include <apr_fnmatch.h>
#include "private/svn_string_private.h"
#include "private/svn_utf_private.h"
#include "svn_private_config.h"
#if SVN_INTERNAL_UTF8PROC
#define UTF8PROC_INLINE
#define strlen svn__strlen_var
#include "utf8proc/utf8proc.c"
#undef strlen
#else
#include <utf8proc.h>
#endif
const char *
svn_utf__utf8proc_compiled_version(void)
{
static const char utf8proc_version[] =
APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "."
APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "."
APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
return utf8proc_version;
}
/* Return the version string reported by utf8proc_version(). */
const char *
svn_utf__utf8proc_runtime_version(void)
{
  const char *const runtime_version = utf8proc_version();

  /* Mention utf8proc entry points that this file never calls.
   * NOTE(review): presumably this silences "unused function" warnings
   * when utf8proc.c is compiled into this translation unit -- confirm. */

  /* Normalization shortcuts. */
  SVN_UNUSED(utf8proc_NFD);
  SVN_UNUSED(utf8proc_NFC);
  SVN_UNUSED(utf8proc_NFKD);
  SVN_UNUSED(utf8proc_NFKC);

  /* Case mapping. */
  SVN_UNUSED(utf8proc_tolower);
  SVN_UNUSED(utf8proc_toupper);
#if UTF8PROC_VERSION_MAJOR >= 2
  SVN_UNUSED(utf8proc_totitle);
#endif

  /* Character properties. */
  SVN_UNUSED(utf8proc_grapheme_break);
  SVN_UNUSED(utf8proc_charwidth);
  SVN_UNUSED(utf8proc_category_string);

  return runtime_version;
}
/* Decompose the UTF-8 STRING into UCS-4 code points stored in BUFFER.
 *
 * If LENGTH is SVN_UTF__UNKNOWN_LENGTH, UTF8PROC_NULLTERM is passed to
 * utf8proc_decompose(); otherwise LENGTH is passed through as given.
 * TRANSFORM_FLAGS are OR-ed with UTF8PROC_DECOMPOSE | UTF8PROC_STABLE.
 *
 * Return the value of the last utf8proc_decompose() call: the number of
 * code points written to BUFFER->data, or a negative utf8proc error code.
 * BUFFER is grown and the call repeated for as long as the reported
 * count exceeds BUFFER's capacity.
 */
static apr_ssize_t
unicode_decomposition(int transform_flags,
                      const char *string, apr_size_t length,
                      svn_membuf_t *buffer)
{
  int options = UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | transform_flags;
  apr_int32_t *codepoints;
  apr_ssize_t capacity;
  apr_ssize_t needed;

  if (length == SVN_UTF__UNKNOWN_LENGTH)
    options |= UTF8PROC_NULLTERM;

  codepoints = buffer->data;
  capacity = buffer->size / sizeof(*codepoints);
  needed = utf8proc_decompose((const void*) string, length,
                              codepoints, capacity, options);

  /* A non-negative count larger than the capacity means the buffer was
   * too small; grow it to the reported size and run again. */
  while (needed >= 0 && needed > capacity)
    {
      svn_membuf__ensure(buffer, needed * sizeof(*codepoints));

      codepoints = buffer->data;
      capacity = buffer->size / sizeof(*codepoints);
      needed = utf8proc_decompose((const void*) string, length,
                                  codepoints, capacity, options);
    }

  return needed;
}
/* Decompose the UTF-8 STRING (see unicode_decomposition(), called here
 * with no extra transform flags) into UCS-4 code points in BUFFER and
 * store the number of code points in *RESULT_LENGTH.
 *
 * Return an SVN_ERR_UTF8PROC_ERROR carrying utf8proc's message if the
 * decomposition reports a negative result.
 */
static svn_error_t *
decompose_normalized(apr_size_t *result_length,
                     const char *string, apr_size_t length,
                     svn_membuf_t *buffer)
{
  const apr_ssize_t ucs4_count =
    unicode_decomposition(0, string, length, buffer);

  if (ucs4_count >= 0)
    {
      *result_length = ucs4_count;
      return SVN_NO_ERROR;
    }

  return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
                          gettext(utf8proc_errmsg(ucs4_count)));
}
/* Decompose the UTF-8 STRING into BUFFER and re-encode it in place as
 * UTF-8 using UTF8PROC_COMPOSE | UTF8PROC_STABLE, storing the value
 * returned by utf8proc_reencode() in *RESULT_LENGTH.
 *
 * CASEFOLD adds UTF8PROC_CASEFOLD and STRIPMARK adds UTF8PROC_STRIPMARK
 * to the decomposition flags.
 *
 * Return an SVN_ERR_UTF8PROC_ERROR carrying utf8proc's message if either
 * step reports a negative result.
 */
static svn_error_t *
normalize_cstring(apr_size_t *result_length,
                  const char *string, apr_size_t length,
                  svn_boolean_t casefold,
                  svn_boolean_t stripmark,
                  svn_membuf_t *buffer)
{
  const int flags = ((casefold ? UTF8PROC_CASEFOLD : 0)
                     | (stripmark ? UTF8PROC_STRIPMARK : 0));
  apr_ssize_t ucs4_count;
  apr_ssize_t utf8_length;

  ucs4_count = unicode_decomposition(flags, string, length, buffer);
  if (ucs4_count < 0)
    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
                            gettext(utf8proc_errmsg(ucs4_count)));

  /* One byte more than the UCS-4 array itself.
   * NOTE(review): presumably room for the NUL that utf8proc_reencode()
   * appends -- confirm against the utf8proc documentation. */
  svn_membuf__resize(buffer, ucs4_count * sizeof(apr_int32_t) + 1);

  utf8_length = utf8proc_reencode(buffer->data, ucs4_count,
                                  UTF8PROC_COMPOSE | UTF8PROC_STABLE);
  if (utf8_length < 0)
    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
                            gettext(utf8proc_errmsg(utf8_length)));

  *result_length = utf8_length;
  return SVN_NO_ERROR;
}
/* Compare the UCS-4 arrays BUFA (LENA elements) and BUFB (LENB elements).
 *
 * Within the common prefix length, return the difference of the first
 * pair of elements that differ.  If the common prefix is identical,
 * return 0 for equal lengths, -1 if BUFA is shorter and 1 if it is longer.
 */
static int
ucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
        const apr_int32_t *bufb, apr_size_t lenb)
{
  const apr_int32_t *pa = bufa;
  const apr_int32_t *pb = bufb;
  apr_size_t remaining = (lenb < lena ? lenb : lena);

  while (remaining-- > 0)
    {
      const int delta = *pa++ - *pb++;
      if (delta != 0)
        return delta;
    }

  /* The shared prefix matches; the shorter array sorts first. */
  if (lena < lenb)
    return -1;
  if (lena > lenb)
    return 1;
  return 0;
}
/* Compare the UTF-8 strings STR1 (LEN1 bytes) and STR2 (LEN2 bytes) by
 * their decomposed UCS-4 forms and store the outcome in *RESULT.
 *
 * A length of SVN_UTF__UNKNOWN_LENGTH means the string is examined up to
 * its NUL terminator.  An empty string compares below any non-empty one
 * and equal to another empty one; this case never touches the buffers.
 * Otherwise BUF1 and BUF2 receive the decompositions of STR1 and STR2,
 * and *RESULT is the value of ucs4cmp() on them.
 *
 * Errors from the decomposition are returned to the caller.
 */
svn_error_t *
svn_utf__normcmp(int *result,
                 const char *str1, apr_size_t len1,
                 const char *str2, apr_size_t len2,
                 svn_membuf_t *buf1, svn_membuf_t *buf2)
{
  apr_size_t ucs4len1;
  apr_size_t ucs4len2;
  svn_boolean_t str1_is_empty = (len1 == 0);
  svn_boolean_t str2_is_empty = (len2 == 0);

  /* With an unknown length, only the first byte tells us about emptiness. */
  if (len1 == SVN_UTF__UNKNOWN_LENGTH)
    str1_is_empty = (*str1 == '\0');
  if (len2 == SVN_UTF__UNKNOWN_LENGTH)
    str2_is_empty = (*str2 == '\0');

  if (str1_is_empty && str2_is_empty)
    {
      *result = 0;
      return SVN_NO_ERROR;
    }
  if (str1_is_empty)
    {
      *result = -1;
      return SVN_NO_ERROR;
    }
  if (str2_is_empty)
    {
      *result = 1;
      return SVN_NO_ERROR;
    }

  SVN_ERR(decompose_normalized(&ucs4len1, str1, len1, buf1));
  SVN_ERR(decompose_normalized(&ucs4len2, str2, len2, buf2));

  *result = ucs4cmp(buf1->data, ucs4len1, buf2->data, ucs4len2);
  return SVN_NO_ERROR;
}
/* Normalize the UTF-8 string STR (LEN bytes) via normalize_cstring(),
 * with case folding and mark stripping both disabled, using BUF as the
 * work area.  On success *RESULT points at BUF->data, so it is only
 * valid for as long as BUF's contents are left untouched.
 *
 * Errors from the normalization are returned to the caller.
 */
svn_error_t*
svn_utf__normalize(const char **result,
                   const char *str, apr_size_t len,
                   svn_membuf_t *buf)
{
  /* The length is computed but not part of this function's interface. */
  apr_size_t normalized_len;

  SVN_ERR(normalize_cstring(&normalized_len, str, len,
                            FALSE /* casefold */,
                            FALSE /* stripmark */,
                            buf));

  *result = (const char*)(buf->data);
  return SVN_NO_ERROR;
}
/* Transform the UTF-8 string STR (LEN bytes) via normalize_cstring(),
 * using BUF as the work area.  CASE_INSENSITIVE requests case folding
 * and ACCENT_INSENSITIVE requests stripping of marks.  On success
 * *RESULT points at BUF->data, so it is only valid for as long as BUF's
 * contents are left untouched.
 *
 * Errors from the normalization are returned to the caller.
 */
svn_error_t *
svn_utf__xfrm(const char **result,
              const char *str, apr_size_t len,
              svn_boolean_t case_insensitive,
              svn_boolean_t accent_insensitive,
              svn_membuf_t *buf)
{
  /* The length is computed but not part of this function's interface. */
  apr_size_t transformed_len;

  SVN_ERR(normalize_cstring(&transformed_len, str, len,
                            case_insensitive /* casefold */,
                            accent_insensitive /* stripmark */,
                            buf));

  *result = (const char*)(buf->data);
  return SVN_NO_ERROR;
}
/* Return TRUE if the NUL-terminated STR, after case folding and mark
 * stripping via svn_utf__xfrm(), matches at least one of the glob
 * PATTERNS (an array of const char *) according to apr_fnmatch().
 * BUF is the work area for the transformation.
 *
 * If the transformation fails, the error is discarded and FALSE is
 * returned.
 */
svn_boolean_t
svn_utf__fuzzy_glob_match(const char *str,
                          const apr_array_header_t *patterns,
                          svn_membuf_t *buf)
{
  const char *folded;
  int idx = 0;
  svn_error_t *err = svn_utf__xfrm(&folded, str, strlen(str),
                                   TRUE, TRUE, buf);

  /* A string we cannot transform is treated as a mismatch. */
  if (err)
    {
      svn_error_clear(err);
      return FALSE;
    }

  /* The first matching pattern settles the answer. */
  while (idx < patterns->nelts)
    {
      const char *candidate = APR_ARRAY_IDX(patterns, idx, const char *);

      if (APR_SUCCESS == apr_fnmatch(candidate, folded, 0))
        return TRUE;
      ++idx;
    }

  return FALSE;
}
/* Append the UTF-8 encoding of the code point UCS4CHR to BUFFER at byte
 * offset *LENGTH and advance *LENGTH by the number of bytes written.
 * BUFFER is grown when fewer than 4 bytes remain past *LENGTH.  No NUL
 * terminator is written.
 *
 * Return an SVN_ERR_UTF8PROC_ERROR if utf8proc_encode_char() produces
 * no bytes for UCS4CHR.
 */
static svn_error_t *
encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
{
  const apr_size_t used = *length;
  apr_byte_t *dest;
  apr_size_t written;

  if (buffer->size - used < 4)
    svn_membuf__resize(buffer, buffer->size + 4);

  /* Compute the write position only after the buffer may have moved. */
  dest = (apr_byte_t*)buffer->data + used;
  written = utf8proc_encode_char(ucs4chr, dest);
  if (written == 0)
    return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
                             _("Invalid Unicode character U+%04lX"),
                             (long)ucs4chr);

  *length = used + written;
  return SVN_NO_ERROR;
}
/* Encode the LENGTH code points at UCS4STR as a NUL-terminated UTF-8
 * string in BUFFER.  *RESULT_LENGTH receives the number of bytes
 * written, not counting the terminator.
 *
 * Return the error from encode_ucs4() for the first code point that
 * cannot be encoded; *RESULT_LENGTH then holds the bytes written so far.
 */
svn_error_t *
svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
                            const apr_int32_t *ucs4str,
                            apr_size_t length,
                            apr_size_t *result_length)
{
  apr_size_t i;

  *result_length = 0;
  for (i = 0; i < length; ++i)
    {
      SVN_ERR(encode_ucs4(buffer, ucs4str[i], result_length));
    }

  /* Make room for, and write, the terminating NUL. */
  svn_membuf__resize(buffer, *result_length + 1);
  ((char*)buffer->data)[*result_length] = '\0';
  return SVN_NO_ERROR;
}
/* Match STRING against PATTERN after decomposing both, and store the
 * outcome in *MATCH.
 *
 * PATTERN, STRING and ESCAPE are UTF-8; each *_LEN may be
 * SVN_UTF__UNKNOWN_LENGTH.  If SQL_LIKE is false, PATTERN is used as a
 * glob pattern and ESCAPE must be NULL.  If SQL_LIKE is true, PATTERN is
 * a LIKE pattern that is translated to a glob pattern first; ESCAPE, if
 * not NULL, must decompose to exactly one code point in the range
 * 0..0xFF.
 *
 * PATTERN_BUF receives the UTF-8 glob pattern, STRING_BUF the UTF-8
 * decomposed string, and TEMP_BUF holds intermediate UCS-4 data.
 *
 * Returns SVN_ERR_UTF8_GLOB for an unusable ESCAPE and
 * SVN_ERR_UTF8PROC_ERROR (directly or via the helpers) when utf8proc
 * reports a failure.
 */
svn_error_t *
svn_utf__glob(svn_boolean_t *match,
const char *pattern, apr_size_t pattern_len,
const char *string, apr_size_t string_len,
const char *escape, apr_size_t escape_len,
svn_boolean_t sql_like,
svn_membuf_t *pattern_buf,
svn_membuf_t *string_buf,
svn_membuf_t *temp_buf)
{
apr_size_t patternbuf_len;
apr_size_t tempbuf_len;
/* A custom escape token is only accepted in LIKE mode. */
if (escape && !sql_like)
return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
_("Cannot use a custom escape token"
" in glob matching mode"));
/* Decompose the pattern to UCS-4 in TEMP_BUF; the matcher below
   (apr_fnmatch) is given char strings, so it is re-encoded as UTF-8. */
SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
if (!sql_like)
SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
tempbuf_len, &patternbuf_len));
else
{
/* Translate the LIKE pattern into a glob pattern, one code point at
   a time. */
const apr_int32_t *like = temp_buf->data;
apr_int32_t ucs4esc;
svn_boolean_t escaped;
apr_size_t i;
if (!escape)
/* NOTE(review): -1 is presumably chosen because it never equals a
   decomposed code point -- confirm. */
ucs4esc = -1;
else
{
const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
? UTF8PROC_NULLTERM : 0);
/* Decompose the escape token into a one-element buffer; the result
   is the number of code points the token decomposes to. */
apr_ssize_t result =
utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
if (result < 0)
return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
gettext(utf8proc_errmsg(result)));
if (result == 0 || result > 1)
return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
_("Escape token must be one character"));
/* Only code points in the range 0..0xFF are accepted. */
if ((ucs4esc & 0xFF) != ucs4esc)
return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
_("Invalid escape character U+%04lX"),
(long)ucs4esc);
}
patternbuf_len = 0;
svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
{
if (*like == ucs4esc && !escaped)
{
/* The LIKE escape token becomes a glob backslash; the next code
   point is then copied literally. */
svn_membuf__resize(pattern_buf, patternbuf_len + 1);
((char*)pattern_buf->data)[patternbuf_len++] = '\\';
escaped = TRUE;
}
else if (escaped)
{
/* Code point following a backslash: emit it unchanged. */
SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
escaped = FALSE;
}
else
{
if ((*like == '[' || *like == '\\') && !escaped)
{
/* '[' and '\\' get a backslash in front of them in the glob
   pattern.  Emit the backslash, then step back so that the loop
   increment lands on this same code point again, now with ESCAPED
   set, and the branch above copies it literally. */
svn_membuf__resize(pattern_buf, patternbuf_len + 1);
((char*)pattern_buf->data)[patternbuf_len++] = '\\';
escaped = TRUE;
--i; --like;
continue;
}
/* LIKE wildcards map to glob wildcards: '%' -> '*', '_' -> '?'. */
if (*like == '%' || *like == '_')
{
const char wildcard = (*like == '%' ? '*' : '?');
svn_membuf__resize(pattern_buf, patternbuf_len + 1);
((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
}
else
SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
}
}
/* NUL-terminate the generated glob pattern. */
svn_membuf__resize(pattern_buf, patternbuf_len + 1);
((char*)pattern_buf->data)[patternbuf_len] = '\0';
}
/* Decompose the subject string the same way and re-encode it as UTF-8.
   TEMPBUF_LEN is reused: first the UCS-4 count, then the UTF-8 length. */
SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
tempbuf_len, &tempbuf_len));
/* apr_fnmatch() returning zero counts as a match. */
*match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
return SVN_NO_ERROR;
}
/* Return TRUE if the NUL-terminated STRING is byte-for-byte identical
 * to the output of normalize_cstring() (without case folding or mark
 * stripping) applied to it.  Return FALSE if normalization fails; the
 * error is discarded.  The work buffer is allocated in SCRATCH_POOL.
 */
svn_boolean_t
svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
{
  const apr_size_t length = strlen(string);
  apr_size_t normalized_length;
  svn_membuf_t buffer;
  svn_error_t *err;

  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
  err = normalize_cstring(&normalized_length, string, length,
                          FALSE, FALSE, &buffer);
  if (err)
    {
      svn_error_clear(err);
      return FALSE;
    }

  /* Different lengths already rule out equality; skip the comparison. */
  if (length != normalized_length)
    return FALSE;

  return (strcmp(string, buffer.data) == 0);
}
/* Return a string, allocated in POOL, that renders the LENGTH bytes at
 * SRC using only printable pieces:
 *
 *   - decomposed code points 1..126 are copied as single bytes;
 *   - code point 0 becomes the two characters "\0";
 *   - a byte that is not part of a well-formed UTF-8 sequence becomes
 *     "?\XX", XX being the byte value in upper-case hex;
 *   - any other valid code point becomes "{U+XXXX}" (six hex digits
 *     above U+FFFF), unless its combining class is non-zero, in which
 *     case it is dropped;
 *   - a well-formed sequence that encodes an invalid code point becomes
 *     "{U?XXXX}".
 *
 * SRC is first decomposed as a whole.  If that fails because of invalid
 * UTF-8, it is processed piecewise: runs of valid UTF-8 are decomposed
 * on their own and the offending bytes in between are recorded
 * individually.
 */
const char *
svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
{
  static const char digits[] = "0123456789ABCDEF";
  static const int decomp_flags = (
      UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
      | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);

  svn_stringbuf_t *result;
  svn_membuf_t buffer;
  apr_ssize_t decomp_length;
  apr_ssize_t len;

  /* Decompose to a non-reversible, "fuzzy" UCS-4 representation. */
  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
  decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
  if (decomp_length < 0)
    {
      svn_membuf_t part;
      apr_size_t done, prev;

      /* The only failure we know how to recover from is invalid UTF-8. */
      SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);

      /* Rebuild BUFFER piece by piece: DONE is how far SRC has been
         scanned, PREV is where the not-yet-decomposed valid run starts. */
      svn_membuf__create(&part, sizeof(apr_int32_t), pool);
      decomp_length = 0;
      done = prev = 0;
      while (done < length)
        {
          apr_int32_t uc;

          /* Advance DONE over as many valid UTF-8 sequences as possible. */
          while (done < length)
            {
              len = utf8proc_iterate((apr_byte_t*)src + done,
                                     length - done, &uc);
              if (len < 0)
                break;
              done += len;
            }

          /* Decompose the valid run [PREV, DONE) and append it. */
          if (done > prev)
            {
              len = unicode_decomposition(
                  decomp_flags, src + prev, done - prev, &part);

              /* A valid run must not fail, but it may legitimately
                 decompose to nothing: DECOMP_FLAGS includes
                 UTF8PROC_STRIPMARK, so a run made up solely of combining
                 marks yields zero code points.  Asserting LEN > 0 here
                 would abort on such input. */
              SVN_ERR_ASSERT_NO_RETURN(len >= 0);
              if (len > 0)
                {
                  svn_membuf__resize(
                      &buffer, (decomp_length + len) * sizeof(apr_int32_t));
                  memcpy((apr_int32_t*)buffer.data + decomp_length,
                         part.data, len * sizeof(apr_int32_t));
                  decomp_length += len;
                }
              prev = done;
            }

          /* What stopped the scan was not valid UTF-8; record it. */
          if (done < length)
            {
              const char *last;
              const char *const p = src + done;

              /* LEN is the sequence length implied by the lead byte. */
              len = utf8proc_utf8class[(apr_byte_t)*p];

              /* Check whether a multi-byte sequence of that length fits
                 into the remaining input and is structurally valid. */
              if (len > 1 && len <= (apr_ssize_t)(length - done))
                last = svn_utf__last_valid(p, len);
              else
                last = NULL;

              if (!last || last - p < len)
                {
                  /* Not a well-formed sequence: store the single byte as
                     a negative value so the output loop can tell it
                     apart from a real code point. */
                  uc = -((apr_int32_t)(*p & 0xff));
                  len = 1;
                }
              else
                {
                  /* Well-formed sequence that utf8proc rejected: decode
                     the raw value by hand so it can be printed. */
                  switch (len)
                    {
                    case 2:
                      uc = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
                      break;
                    case 3:
                      uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
                            + (p[2] & 0x3f));
                      break;
                    case 4:
                      uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
                            + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
                      break;
                    default:
                      SVN_ERR_ASSERT_NO_RETURN(
                          !"Unexpected invalid UTF-8 byte");
                    }
                }

              svn_membuf__resize(
                  &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
              ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
              done += len;
              prev = done;
            }
        }
    }

  /* Render the UCS-4 buffer into the printable result. */
  result = svn_stringbuf_create_ensure(decomp_length, pool);
  for (len = 0; len < decomp_length; ++len)
    {
      const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];

      if (cp > 0 && cp < 127)
        svn_stringbuf_appendbyte(result, (char)cp);
      else if (cp == 0)
        svn_stringbuf_appendcstr(result, "\\0");
      else if (cp < 0)
        {
          /* A raw byte recorded by the recovery path above. */
          const apr_int32_t rcp = ((-cp) & 0xff);
          svn_stringbuf_appendcstr(result, "?\\");
          svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
          svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
        }
      else
        {
          if (utf8proc_codepoint_valid(cp))
            {
              const utf8proc_property_t *prop = utf8proc_get_property(cp);

              /* Leave out code points with a non-zero combining class. */
              if (prop->combining_class != 0)
                continue;
              svn_stringbuf_appendcstr(result, "{U+");
            }
          else
            svn_stringbuf_appendcstr(result, "{U?");

          /* Four hex digits, or six for values above 0xFFFF. */
          if (cp > 0xffff)
            {
              svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
              svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
            }
          svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
          svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
          svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
          svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
          svn_stringbuf_appendbyte(result, '}');
        }
    }

  return result->data;
}