utf8.c   [plain text]


#ifndef lint
static char *rcsid = "$Id: utf8.c,v 1.1 2003/06/04 00:26:44 marka Exp $";
#endif

/*
 * Copyright (c) 2000 Japan Network Information Center.  All rights reserved.
 *  
 * By using this file, you agree to the terms and conditions set forth bellow.
 * 
 * 			LICENSE TERMS AND CONDITIONS 
 * 
 * The following License Terms and Conditions apply, unless a different
 * license is obtained from Japan Network Information Center ("JPNIC"),
 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
 * Chiyoda-ku, Tokyo 101-0047, Japan.
 * 
 * 1. Use, Modification and Redistribution (including distribution of any
 *    modified or derived work) in source and/or binary forms is permitted
 *    under this License Terms and Conditions.
 * 
 * 2. Redistribution of source code must retain the copyright notices as they
 *    appear in each source code file, this License Terms and Conditions.
 * 
 * 3. Redistribution in binary form must reproduce the Copyright Notice,
 *    this License Terms and Conditions, in the documentation and/or other
 *    materials provided with the distribution.  For the purposes of binary
 *    distribution the "Copyright Notice" refers to the following language:
 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
 * 
 * 4. The name of JPNIC may not be used to endorse or promote products
 *    derived from this Software without specific prior written approval of
 *    JPNIC.
 * 
 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
 */

#include <config.h>

#include <stddef.h>

#include <idn/assert.h>
#include <idn/logmacro.h>
#include <idn/utf8.h>
#include <idn/debug.h>

#define UTF8_WIDTH(c) \
	(((c) < 0x80) ? 1 : \
	 ((c) < 0xc0) ? 0 : \
	 ((c) < 0xe0) ? 2 : \
	 ((c) < 0xf0) ? 3 : \
	 ((c) < 0xf8) ? 4 : \
	 ((c) < 0xfc) ? 5 : \
	 ((c) < 0xfe) ? 6 : 0)

#define VALID_CONT_BYTE(c)	(0x80 <= (c) && (c) < 0xc0)

int
idn_utf8_mblen(const char *s) {
	int c = *(unsigned char *)s;

	assert(s != NULL);

#if 0
	TRACE(("idn_utf8_mblen(s=<%s>)\n", idn__debug_hexstring(s, 6)));
#endif

	return UTF8_WIDTH(c);
}

int
idn_utf8_getmb(const char *s, size_t len, char *buf) {
	/* buf must be at least 7-bytes long */
	const unsigned char *p = (const unsigned char *)s;
	unsigned char *q = (unsigned char *)buf;
	int width = UTF8_WIDTH(*p);
	int w;

	assert(s != NULL);

#if 0
	TRACE(("idn_utf8_getmb(s=<%s>,len=%d)\n",
	      idn__debug_hexstring(s, 6), len));
#endif

	if (width == 0 || len < width)
		return (0);

	/* Copy the first byte. */
	*q++ = *p++;

	/* .. and the rest. */
	w = width;
	while (--w > 0) {
		if (!VALID_CONT_BYTE(*p))
			return (0);
		*q++ = *p++;
	}
	return (width);
}

int
idn_utf8_getwc(const char *s, size_t len, unsigned long *vp) {
	unsigned long v;
	unsigned long min;
	const unsigned char *p = (const unsigned char *)s;
	int c;
	int width;
	int rest;

	assert(s != NULL);

#if 0
	TRACE(("idn_utf8_getwc(s=<%s>,len=%d)\n",
	      idn__debug_hexstring(s, 10), len));
#endif

	c = *p++;
	width = UTF8_WIDTH(c);

	switch (width) {
	case 0:
		return (0);
	case 1:
		v = c;
		min = 0;
		break;
	case 2:
		v = c & 0x1f;
		min = 0x80;
		break;
	case 3:
		v = c & 0xf;
		min = 0x800;
		break;
	case 4:
		v = c & 0x7;
		min = 0x10000;
		break;
	case 5:
		v = c & 3;
		min = 0x200000;
		break;
	case 6:
		v = c & 1;
		min = 0x4000000;
		break;
	default:
		FATAL(("idn_utf8_getint: internal error\n"));
		return (0);
	}

	if (len < width)
		return (0);
		
	rest = width - 1;
	while (rest-- > 0) {
		if (!VALID_CONT_BYTE(*p))
			return (0);
		v = (v << 6) | (*p & 0x3f);
		p++;
	}

	if (v < min)
		return (0);

	*vp = v;
	return (width);
}

int
idn_utf8_putwc(char *s, size_t len, unsigned long v) {
	unsigned char *p = (unsigned char *)s;
	int mask;
	int off;
	int l;

	assert(s != NULL);

#if 0
	TRACE(("idn_utf8_putwc(v=%lx)\n", v));
#endif

	if (v < 0x80) {
		mask = 0;
		l = 1;
	} else if (v < 0x800) {
		mask = 0xc0;
		l = 2;
	} else if (v < 0x10000) {
		mask = 0xe0;
		l = 3;
	} else if (v < 0x200000) {
		mask = 0xf0;
		l = 4;
	} else if (v < 0x4000000) {
		mask = 0xf8;
		l = 5;
	} else if (v < 0x80000000) {
		mask = 0xfc;
		l = 6;
	} else {
		return (0);
	}

	if (len < l)
		return (0);

	off = 6 * (l - 1);
	*p++ = (v >> off) | mask;
	mask = 0x80;
	while (off > 0) {
		off -= 6;
		*p++ = ((v >> off) & 0x3f) | mask;
	}
	return l;
}

int
idn_utf8_isvalidchar(const char *s) {
	unsigned long dummy;

	TRACE(("idn_utf8_isvalidchar(s=<%s>)\n",
	      idn__debug_hexstring(s, 6)));

	return (idn_utf8_getwc(s, 6, &dummy) > 0);
}

int
idn_utf8_isvalidstring(const char *s) {
	unsigned long dummy;
	int width;

	assert(s != NULL);

	TRACE(("idn_utf8_isvalidstring(s=<%s>)\n",
	      idn__debug_hexstring(s, 20)));

	while (*s != '\0') {
		width = idn_utf8_getwc(s, 6, &dummy);
		if (width == 0)
			return (0);
		s += width;
	}
	return (1);
}

char *
idn_utf8_findfirstbyte(const char *s, const char *known_top) {
	const unsigned char *p = (const unsigned char *)s;
	const unsigned char *t = (const unsigned char *)known_top;

	assert(s != NULL && known_top != NULL && known_top <= s);

	TRACE(("idn_utf8_findfirstbyte(s=<%s>)\n",
	      idn__debug_hexstring(s, 8)));

	while (p >= t) {
		if (!VALID_CONT_BYTE(*p))
		    break;
		p--;
	}
	if (p < t || UTF8_WIDTH(*p) == 0)
		return (NULL);

	return ((char *)p);
}