ucsmap.c   [plain text]


#ifndef lint
static char *rcsid = "$Id: ucsmap.c,v 1.1 2003/06/04 00:26:14 marka Exp $";
#endif

/*
 * Copyright (c) 2001 Japan Network Information Center.  All rights reserved.
 *  
 * By using this file, you agree to the terms and conditions set forth bellow.
 * 
 * 			LICENSE TERMS AND CONDITIONS 
 * 
 * The following License Terms and Conditions apply, unless a different
 * license is obtained from Japan Network Information Center ("JPNIC"),
 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
 * Chiyoda-ku, Tokyo 101-0047, Japan.
 * 
 * 1. Use, Modification and Redistribution (including distribution of any
 *    modified or derived work) in source and/or binary forms is permitted
 *    under this License Terms and Conditions.
 * 
 * 2. Redistribution of source code must retain the copyright notices as they
 *    appear in each source code file, this License Terms and Conditions.
 * 
 * 3. Redistribution in binary form must reproduce the Copyright Notice,
 *    this License Terms and Conditions, in the documentation and/or other
 *    materials provided with the distribution.  For the purposes of binary
 *    distribution the "Copyright Notice" refers to the following language:
 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
 * 
 * 4. The name of JPNIC may not be used to endorse or promote products
 *    derived from this Software without specific prior written approval of
 *    JPNIC.
 * 
 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
 */

#include <config.h>

#include <stdlib.h>
#include <string.h>

#include <idn/result.h>
#include <idn/assert.h>
#include <idn/log.h>
#include <idn/logmacro.h>
#include <idn/ucsmap.h>

#define INIT_SIZE		50
#define DEFAULT_BUF_SIZE	500
#define UCSMAP_HASH_SIZE	103
#define MAX_MAPLEN		0xffff

/*
 * This module implements UCS 1-to-N mapping.
 * To speed up mapping table lookup, a combination of hash and
 * binary search is used.
 */

/*
 * Mapping entry.
 * Entries are sorted by its hash index and code point.
 */
typedef struct {
	short hidx;		/* hash index */
	unsigned short len;	/* length of mapped sequence */
	unsigned long ucs;	/* code point to be mapped */
	unsigned long *map;	/* mapped sequence of code points */
} ucsmap_entry_t;

/*
 * Hash table entry.
 * Since the entries pointed by ucsmap_hash_t.entry are sorted,
 * binary search can be used.
 */
typedef struct {
	ucsmap_entry_t *entry;	/* sorted by code point */
	int n;			/* length of 'entry' */
} ucsmap_hash_t;

/*
 * UCS character buffer for storing target character sequence.
 */
typedef struct ucsmap_buf {
	struct ucsmap_buf *next;
	unsigned long buf[1];		/* actually a variable length array */
} ucsmap_buf_t;

/*
 * Mapping object.
 */
typedef struct idn_ucsmap {
	ucsmap_hash_t hash[UCSMAP_HASH_SIZE];
	ucsmap_entry_t *entries;	/* array of entries */
	size_t entry_size;		/* allocated size */
	size_t nentries;		/* # of entries in use */
	ucsmap_buf_t *mapdata;		/* list of character buffers */
	size_t mapdata_size;		/* allocated size of current buffer */
	size_t mapdata_used;		/* # of chars in use */
	int fixed;			/* already fixed? */
	int refcnt;			/* reference count */
} ucsmap_t;

static int		ucsmap_hash(unsigned long v);
static unsigned long	*save_mapped_sequence(idn_ucsmap_t ctx,
					      unsigned long *map,
					      size_t maplen);
static void		free_mapbuf(ucsmap_buf_t *buf);
static int		comp_entry(const void *v1, const void *v2);

idn_result_t
idn_ucsmap_create(idn_ucsmap_t *ctxp) {
	idn_ucsmap_t ctx;

	assert(ctxp != NULL);

	TRACE(("idn_ucsmap_create()\n"));

	if ((ctx = malloc(sizeof(*ctx))) == NULL) {
		WARNING(("idn_ucsmap_create: malloc failed\n"));
		return (idn_nomemory);
	}

	ctx->entry_size = 0;
	ctx->nentries = 0;
	ctx->entries = NULL;
	ctx->mapdata = NULL;
	ctx->mapdata_size = 0;
	ctx->mapdata_used = 0;
	ctx->fixed = 0;
	ctx->refcnt = 1;
	*ctxp = ctx;
	return (idn_success);
}

void
idn_ucsmap_destroy(idn_ucsmap_t ctx) {
	assert(ctx != NULL && ctx->refcnt > 0);

	TRACE(("idn_ucsmap_destroy()\n"));

	if (--ctx->refcnt == 0) {
		if (ctx->entries != NULL)
			free(ctx->entries);
		if (ctx->mapdata != NULL)
			free_mapbuf(ctx->mapdata);
		free(ctx);
	}
}

void
idn_ucsmap_incrref(idn_ucsmap_t ctx) {
	assert(ctx != NULL && ctx->refcnt > 0);

	ctx->refcnt++;
}

idn_result_t
idn_ucsmap_add(idn_ucsmap_t ctx, unsigned long ucs,
	       unsigned long *map, size_t maplen)
{
	ucsmap_entry_t *e;
	ucsmap_entry_t *newbuf;

	assert(ctx != NULL && ctx->refcnt > 0);

	TRACE(("idn_ucsmap_add(ucs=U+%lX, maplen=%u)\n", ucs, maplen));

	/* Make sure it is not fixed yet. */
	if (ctx->fixed) {
		WARNING(("idn_ucsmap_add: attempt to add to fixed map\n"));
		return (idn_failure);
	}

	if (maplen > MAX_MAPLEN) {
		WARNING(("idn_ucsmap_add: maplen too large (> %d)\n",
			 MAX_MAPLEN));
		return (idn_failure);
	}

	/* Append an entry. */
	if (ctx->nentries >= ctx->entry_size) {
		if (ctx->entry_size == 0)
			ctx->entry_size = INIT_SIZE;
		else
			ctx->entry_size *= 2;
		newbuf = realloc(ctx->entries, sizeof(*e) * ctx->entry_size);
		if (newbuf == NULL)
			return (idn_nomemory);
		ctx->entries = newbuf;
	}
	e = &ctx->entries[ctx->nentries];
	e->hidx = ucsmap_hash(ucs);
	e->len = maplen;
	e->ucs = ucs;
	if (maplen > 0) {
		/* Save mapped sequence in the buffer. */
		e->map = save_mapped_sequence(ctx, map, maplen);
		if (e->map == NULL)
			return (idn_nomemory);
	} else {
		/*
		 * Zero 'maplen' is perfectly valid meaning one-to-zero
		 * mapping.
		 */
		e->map = NULL;
	}
	ctx->nentries++;

	return (idn_success);
}

void
idn_ucsmap_fix(idn_ucsmap_t ctx) {
	ucsmap_entry_t *e;
	int last_hidx;
	int i;

	assert(ctx != NULL && ctx->refcnt > 0);

	TRACE(("idn_ucsmap_fix()\n"));

	if (ctx->fixed)
		return;

	ctx->fixed = 1;

	/* Initialize hash. */
	for (i = 0; i < UCSMAP_HASH_SIZE; i++) {
		ctx->hash[i].entry = NULL;
		ctx->hash[i].n = 0;
	}

	if (ctx->nentries == 0)
		return;

	/* Sort entries by the hash value and code point. */
	qsort(ctx->entries, ctx->nentries, sizeof(ucsmap_entry_t), comp_entry);

	/*
	 * Now the entries are sorted by their hash value, and
	 * sorted by its code point among the ones with the same hash value.
	 */

	/* Build hash table. */
	last_hidx = -1;
	for (i = 0, e = ctx->entries; i < ctx->nentries; i++, e++) {
		if (e->hidx != last_hidx) {
			ctx->hash[e->hidx].entry = e;
			last_hidx = e->hidx;
		}
		ctx->hash[last_hidx].n++;
	}
}

idn_result_t
idn_ucsmap_map(idn_ucsmap_t ctx, unsigned long v, unsigned long *to,
	       size_t tolen, size_t *maplenp) {
	int hash;
	ucsmap_entry_t *e;
	int n;
	int hi, lo, mid;

	assert(ctx != NULL && ctx->refcnt > 0 && to != NULL &&
	       maplenp != NULL);

	TRACE(("idn_ucsmap_map(v=U+%lX)\n", v));

	if (!ctx->fixed) {
		WARNING(("idn_ucsmap_map: not fixed yet\n"));
		return (idn_failure);
	}

	/* First, look up hash table. */
	hash = ucsmap_hash(v);
	if ((n = ctx->hash[hash].n) == 0)
		goto nomap;

	/* Then do binary search. */
	e = ctx->hash[hash].entry;
	lo = 0;
	hi = n - 1;
	while (lo <= hi) {
		mid = (lo + hi) / 2;
		if (v < e[mid].ucs)
			hi = mid - 1;
		else if (v > e[mid].ucs)
			lo = mid + 1;
		else {
			/* Found. */
			if (tolen < e[mid].len)
				return (idn_buffer_overflow);
			memcpy(to, e[mid].map, sizeof(*to) * e[mid].len);
			*maplenp = e[mid].len;
			return (idn_success);
		}
	}

	/*
	 * Not found. Put the original character to 'to'
	 * just for convenience.
	 */
 nomap:
	if (tolen < 1)
		return (idn_buffer_overflow);
	*to = v;
	*maplenp = 1;
	return (idn_nomapping);
}

static int
ucsmap_hash(unsigned long v) {
	return (v % UCSMAP_HASH_SIZE);
}

static unsigned long *
save_mapped_sequence(idn_ucsmap_t ctx, unsigned long *map, size_t maplen) {
	ucsmap_buf_t *buf;
	unsigned long *p;
	size_t allocsize;

	/*
	 * If the current buffer (the first one in the ctx->mapdata list)
	 * has enough space, use it.  Otherwise, allocate a new buffer and
	 * insert it at the beginning of the list.
	 */
	if (ctx->mapdata_used + maplen > ctx->mapdata_size) {
		if (maplen > DEFAULT_BUF_SIZE)
			allocsize = maplen * 2;
		else
			allocsize = DEFAULT_BUF_SIZE;
		buf = malloc(sizeof(ucsmap_hash_t) +
			     sizeof(unsigned long) * allocsize);
		if (buf == NULL)
			return (NULL);
		buf->next = ctx->mapdata;
		ctx->mapdata = buf;
		ctx->mapdata_size = allocsize;
		ctx->mapdata_used = 0;
	}
	p = ctx->mapdata->buf + ctx->mapdata_used;
	memcpy(p, map, sizeof(unsigned long) * maplen);
	ctx->mapdata_used += maplen;
	return (p);
}

static void
free_mapbuf(ucsmap_buf_t *buf) {
	while (buf != NULL) {
		ucsmap_buf_t *next = buf->next;
		free(buf);
		buf = next;
	}
}

static int
comp_entry(const void *v1, const void *v2) {
	const ucsmap_entry_t *e1 = v1;
	const ucsmap_entry_t *e2 = v2;

	if (e1->hidx < e2->hidx)
		return (-1);
	else if (e1->hidx > e2->hidx)
		return (1);
	else if (e1->ucs < e2->ucs)
		return (-1);
	else if (e1->ucs > e2->ucs)
		return (1);
	else
		return (0);
}