unicode.c   [plain text]


#ifndef lint
static char *rcsid = "$Id: unicode.c,v 1.1 2003/06/04 00:26:16 marka Exp $";
#endif

/*
 * Copyright (c) 2000,2001,2002 Japan Network Information Center.
 * All rights reserved.
 *  
 * By using this file, you agree to the terms and conditions set forth bellow.
 * 
 * 			LICENSE TERMS AND CONDITIONS 
 * 
 * The following License Terms and Conditions apply, unless a different
 * license is obtained from Japan Network Information Center ("JPNIC"),
 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
 * Chiyoda-ku, Tokyo 101-0047, Japan.
 * 
 * 1. Use, Modification and Redistribution (including distribution of any
 *    modified or derived work) in source and/or binary forms is permitted
 *    under this License Terms and Conditions.
 * 
 * 2. Redistribution of source code must retain the copyright notices as they
 *    appear in each source code file, this License Terms and Conditions.
 * 
 * 3. Redistribution in binary form must reproduce the Copyright Notice,
 *    this License Terms and Conditions, in the documentation and/or other
 *    materials provided with the distribution.  For the purposes of binary
 *    distribution the "Copyright Notice" refers to the following language:
 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
 * 
 * 4. The name of JPNIC may not be used to endorse or promote products
 *    derived from this Software without specific prior written approval of
 *    JPNIC.
 * 
 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
 */

#include <config.h>

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#include <idn/result.h>
#include <idn/logmacro.h>
#include <idn/assert.h>
#include <idn/unicode.h>

#define UNICODE_CURRENT	"3.2.0"

#define UCS_MAX		0x10ffff
#define END_BIT		0x80000000

/*
 * Some constants for Hangul decomposition/composition.
 */
#define SBase		0xac00
#define LBase		0x1100
#define VBase		0x1161
#define TBase		0x11a7
#define LCount		19
#define VCount		21
#define TCount		28
#define SLast		(SBase + LCount * VCount * TCount)

/*
 * Symbol composition macro.
 */
#define compose_sym(a, b)		compose_symX(a, b)
#define compose_symX(a, b)		a ## b

struct composition {
	unsigned long c2;	/* 2nd character */
	unsigned long comp;	/* composed character */
};

#include "unicodedata_320.c"
#define VERSION v320
#include "unicode_template.c"
#undef VERSION

typedef int	(*unicode_canonclassproc)(unsigned long v);
typedef int	(*unicode_decomposeproc)(unsigned long c,
					 const unsigned long **seqp);
typedef int	(*unicode_composeproc)(unsigned long c,
				       const struct composition **compp);

static struct idn__unicode_ops {
	char *version;
	unicode_canonclassproc canonclass_proc;
	unicode_decomposeproc decompose_proc;
	unicode_composeproc compose_proc;
} unicode_versions[] = {
#define MAKE_UNICODE_HANDLE(version, suffix) \
	{ version, \
	  compose_sym(canonclass_, suffix), \
	  compose_sym(decompose_, suffix), \
	  compose_sym(compose_, suffix) }
	MAKE_UNICODE_HANDLE("3.2.0", v320),
	{ NULL },
#undef MAKE_UNICODE_HANDLE
};
	
idn_result_t
idn__unicode_create(const char *version,
		    idn__unicode_version_t *versionp) {
	idn__unicode_version_t v;

	assert(versionp != NULL);
	TRACE(("idn__unicode_create(version=%-.50s)\n",
	       version == NULL ? "<NULL>" : version));

	if (version == NULL)
		version = UNICODE_CURRENT;

	for (v = unicode_versions; v->version != NULL; v++) {
		if (strcmp(v->version, version) == 0) {
			*versionp = v;
			return (idn_success);
		}
	}
	return (idn_notfound);
}

void
idn__unicode_destroy(idn__unicode_version_t version) {
	assert(version != NULL);
	TRACE(("idn__unicode_destroy()\n"));
	/* Nothing to do */
}

int
idn__unicode_canonicalclass(idn__unicode_version_t version, unsigned long c) {
	if (c > UCS_MAX)
		return (0);

	return (*version->canonclass_proc)(c);
}

idn_result_t
idn__unicode_decompose(idn__unicode_version_t version,
		       int compat, unsigned long *v, size_t vlen,
		       unsigned long c, int *decomp_lenp) {
	unsigned long *vorg = v;
	int seqidx;
	const unsigned long *seq;

	assert(v != NULL && vlen >= 0 && decomp_lenp != NULL);

	if (c > UCS_MAX)
		return (idn_notfound);

	/*
	 * First, check for Hangul.
	 */
	if (SBase <= c && c < SLast) {
		int idx, t_offset, v_offset, l_offset;

		idx = c - SBase;
		t_offset = idx % TCount;
		idx /= TCount;
		v_offset = idx % VCount;
		l_offset = idx / VCount;
		if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
			return (idn_buffer_overflow);
		*v++ = LBase + l_offset;
		*v++ = VBase + v_offset;
		if (t_offset > 0)
			*v++ = TBase + t_offset;
		*decomp_lenp = v - vorg;
		return (idn_success);
	}

	/*
	 * Look up decomposition table.  If no decomposition is defined
	 * or if it is a compatibility decomosition when canonical
	 * decomposition requested, return 'idn_notfound'.
	 */
	seqidx = (*version->decompose_proc)(c, &seq);
	if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
		return (idn_notfound);
	
	/*
	 * Copy the decomposed sequence.  The end of the sequence are
	 * marked with END_BIT.
	 */
	do {
		unsigned long c;
		int dlen;
		idn_result_t r;

		c = *seq & ~END_BIT;

		/* Decompose recursively. */
		r = idn__unicode_decompose(version, compat, v, vlen, c, &dlen);
		if (r == idn_success) {
			v += dlen;
			vlen -= dlen;
		} else if (r == idn_notfound) {
			if (vlen < 1)
				return (idn_buffer_overflow);
			*v++ = c;
			vlen--;
		} else {
			return (r);
		}

	} while ((*seq++ & END_BIT) == 0);
	
	*decomp_lenp = v - vorg;

	return (idn_success);
}

int
idn__unicode_iscompositecandidate(idn__unicode_version_t version,
				  unsigned long c) {
	const struct composition *dummy;

	if (c > UCS_MAX)
		return (0);

	/* Check for Hangul */
	if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
		return (1);

	/*
	 * Look up composition table.  If there are no composition
	 * that begins with the given character, it is not a
	 * composition candidate.
	 */
	if ((*version->compose_proc)(c, &dummy) == 0)
		return (0);
	else
		return (1);
}

idn_result_t
idn__unicode_compose(idn__unicode_version_t version, unsigned long c1,
		     unsigned long c2, unsigned long *compp) {
	int n;
	int lo, hi;
	const struct composition *cseq;

	assert(compp != NULL);

	if (c1 > UCS_MAX || c2 > UCS_MAX)
		return (idn_notfound);

	/*
	 * Check for Hangul.
	 */
	if (LBase <= c1 && c1 < LBase + LCount &&
	    VBase <= c2 && c2 < VBase + VCount) {
		/*
		 * Hangul L and V.
		 */
		*compp = SBase +
			((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
		return (idn_success);
	} else if (SBase <= c1 && c1 < SLast &&
		   TBase <= c2 && c2 < TBase + TCount &&
		   (c1 - SBase) % TCount == 0) {
		/*
		 * Hangul LV and T.
		 */
		*compp = c1 + (c2 - TBase);
		return (idn_success);
	}

	/*
	 * Look up composition table.  If the result is 0, no composition
	 * is defined.  Otherwise, upper 16bits of the result contains
	 * the number of composition that begins with 'c1', and the lower
	 * 16bits is the offset in 'compose_seq'.
	 */
	if ((n = (*version->compose_proc)(c1, &cseq)) == 0)
		return (idn_notfound);

	/*
	 * The composite sequences are sorted by the 2nd character 'c2'.
	 * So we can use binary search.
	 */
	lo = 0;
	hi = n - 1;
	while (lo <= hi) {
		int mid = (lo + hi) / 2;

		if (cseq[mid].c2 < c2) {
			lo = mid + 1;
		} else if (cseq[mid].c2 > c2) {
			hi = mid - 1;
		} else {
			*compp = cseq[mid].comp;
			return (idn_success);
		}
	}
	return (idn_notfound);
}