/* uniname.c [plain text] */

/* Association between Unicode characters and their names.
   Copyright (C) 2000-2002 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */

#if HAVE_CONFIG_H
# include <config.h>
#endif

/* Specification.  */
#include "uniname.h"

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))


/* Table of Unicode character names, derived from UnicodeData.txt.  */
#define uint16_t unsigned short
#define uint32_t unsigned int
#include "uninames.h"
/* It contains:
  static const char unicode_name_words[26496] = ...;
  #define UNICODE_CHARNAME_NUM_WORDS 4725
  static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  #define UNICODE_CHARNAME_WORD_HANGUL 3030
  #define UNICODE_CHARNAME_WORD_SYLLABLE 3891
  #define UNICODE_CHARNAME_WORD_CJK 367
  #define UNICODE_CHARNAME_WORD_COMPATIBILITY 4585
  static const uint16_t unicode_names[53315] = ...;
  static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[12886] = ...;
  static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[12886] = ...;
  #define UNICODE_CHARNAME_MAX_LENGTH 83
  #define UNICODE_CHARNAME_MAX_WORDS 13
*/

/* Returns the word with a given index.
   INDEX must be less than UNICODE_CHARNAME_NUM_WORDS.  The length of the
   word is stored in *LENGTHP.  The return value points into
   unicode_name_words, where words of equal length are packed back to back,
   so the word is not NUL-terminated.  */
static const char *
unicode_name_word (unsigned int index, unsigned int *lengthp)
{
  unsigned int lo = 0;
  unsigned int hi = SIZEOF (unicode_name_by_length) - 1;
  unsigned int first_index;

  assert (index < UNICODE_CHARNAME_NUM_WORDS);

  /* Bisect for the length bucket lo with
       unicode_name_by_length[lo].ind_offset <= index
     and
       index < unicode_name_by_length[lo+1].ind_offset
     The bucket number is the length of the words it holds.  */
  while (hi - lo > 1)
    {
      unsigned int mid = (lo + hi) / 2;

      if (index < unicode_name_by_length[mid].ind_offset)
        hi = mid;
      else
        lo = mid;
    }

  first_index = unicode_name_by_length[lo].ind_offset;
  assert (first_index <= index
          && index < unicode_name_by_length[lo + 1].ind_offset);

  *lengthp = lo;
  /* Skip (index - first_index) words of lo characters each.  */
  return unicode_name_words
         + unicode_name_by_length[lo].extra_offset
         + (index - first_index) * lo;
}

/* Looks up the index of a word.
   WORD points to LENGTH characters (no terminating NUL is required).
   Returns the index of the word in the word table, or -1 if the table
   contains no such word.  */
static int
unicode_name_word_lookup (const char *word, unsigned int length)
{
  unsigned int base_offset;
  unsigned int first;
  unsigned int lo;
  unsigned int hi;

  /* Only lengths that own a bucket in unicode_name_by_length can match.  */
  if (length == 0 || length >= SIZEOF (unicode_name_by_length) - 1)
    return -1;

  base_offset = unicode_name_by_length[length].extra_offset;
  first = unicode_name_by_length[length].ind_offset;
  lo = first;
  hi = unicode_name_by_length[length + 1].ind_offset;

  /* Binary search among the words of given length.  */
  while (hi != lo)
    {
      unsigned int mid = (lo + hi) / 2;
      const char *entry =
        &unicode_name_words[base_offset + (mid - first) * length];
      unsigned int k = 0;

      /* Find the first position where the table entry and WORD differ.  */
      while (k < length && entry[k] == word[k])
        k++;
      if (k == length)
        return mid;

      if (entry[k] < word[k])
        {
          /* The interval cannot shrink any further: WORD is absent.  */
          if (lo == mid)
            return -1;
          /* Note here: lo < mid < hi.  */
          lo = mid;
        }
      else
        {
          /* Note here: lo <= mid < hi.  */
          hi = mid;
        }
    }
  return -1;
}

/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.
   Each table lists the jamo short names in code order; a Hangul syllable
   name is "HANGUL SYLLABLE " followed by one entry from each table
   (initial, medial, final).  The empty string stands for the silent initial
   and for a missing final.  */
static const char jamo_initial_short_name[19][3] =
{
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
  "C", "K", "T", "P", "H"
};
static const char jamo_medial_short_name[21][4] =
{
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
  "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};
/* Entry 5 is "NJ" (U+11AC HANGUL JONGSEONG NIEUN-CIEUC), as in the Unicode
   Character Database file Jamo.txt; e.g. U+AC05 is HANGUL SYLLABLE GANJ.  */
static const char jamo_final_short_name[28][3] =
{
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
  "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
};

/* Looks up the name of a Unicode character, in uppercase ASCII.
   The NUL-terminated name is written to BUF, which the caller provides.
   Returns the filled buf, or NULL if the character does not have a name.
   Hangul syllables and CJK compatibility ideographs are named
   algorithmically; every other character is looked up in
   unicode_code_to_name.  */
char *
unicode_character_name (unsigned int c, char *buf)
{
  char *out;
  const uint16_t *words;
  unsigned int key;
  unsigned int lo;
  unsigned int hi;

  if (c >= 0xAC00 && c <= 0xD7A3)
    {
      /* Special case for Hangul syllables. Keeps the tables small.
         The syllable number decomposes as
           (initial * 21 + medial) * 28 + final.  */
      unsigned int syllable = c - 0xAC00;
      const char *parts[3];
      unsigned int k;

      parts[0] = jamo_initial_short_name[syllable / (28 * 21)];
      parts[1] = jamo_medial_short_name[(syllable / 28) % 21];
      parts[2] = jamo_final_short_name[syllable % 28];

      /* buf needs room for the 16 byte prefix, up to 2 + 3 + 2 jamo
         letters and the terminating NUL.  */
      memcpy (buf, "HANGUL SYLLABLE ", 16);
      out = buf + 16;
      for (k = 0; k < 3; k++)
        {
          const char *s;

          for (s = parts[k]; *s != '\0'; s++)
            *out++ = *s;
        }
      *out = '\0';
      return buf;
    }

  if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
      || (c >= 0x2F800 && c <= 0x2FA1D))
    {
      /* Special case for CJK compatibility ideographs. Keeps the tables
         small.  The name ends in the code point written as 4 or 5
         uppercase hexadecimal digits.  */
      unsigned int ndigits = (c < 0x10000 ? 4 : 5);

      /* buf needs room for the 28 byte prefix, up to 5 digits and the
         terminating NUL.  */
      memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
      out = buf + 28;
      while (ndigits-- > 0)
        {
          unsigned int nibble = (c >> (4 * ndigits)) & 0xf;

          *out++ = nibble < 10 ? '0' + nibble : 'A' + (nibble - 10);
        }
      *out = '\0';
      return buf;
    }

  /* Transform the code so that it fits in 16 bits.  Only the 4096-code
     pages listed here can contain named characters.  */
  switch (c >> 12)
    {
    case 0x00:
    case 0x01:
    case 0x02:
    case 0x03:
      key = c;
      break;
    case 0x0A:
      key = c - 0x06000;
      break;
    case 0x0F:
    case 0x10:
      key = c - 0x0A000;
      break;
    case 0x1D:
      key = c - 0x16000;
      break;
    case 0x2F:
      key = c - 0x27000;
      break;
    case 0xE0:
      key = c - 0xD7000;
      break;
    default:
      return NULL;
    }

  /* Binary search in unicode_code_to_name.  */
  lo = 0;
  hi = SIZEOF (unicode_code_to_name);
  for (;;)
    {
      unsigned int mid = (lo + hi) / 2;
      unsigned int code = unicode_code_to_name[mid].code;

      if (code == key)
        {
          words = &unicode_names[unicode_code_to_name[mid].name];
          break;
        }
      if (code < key)
        {
          /* The interval cannot shrink any further: no such code.  */
          if (lo == mid)
            return NULL;
          /* Note here: lo < mid < hi.  */
          lo = mid;
        }
      else
        {
          if (hi == mid)
            return NULL;
          /* Note here: lo <= mid < hi.  */
          hi = mid;
        }
    }

  /* Found it in unicode_code_to_name. Now concatenate the words.
     Each element of the sequence holds a word index shifted left by one;
     its lowest bit is set when another word follows.  */
  /* buf needs room for a name of UNICODE_CHARNAME_MAX_LENGTH characters
     and the terminating NUL.  */
  out = buf;
  for (;; words++)
    {
      unsigned int wordlen;
      const char *word = unicode_name_word (*words >> 1, &wordlen);

      do
        *out++ = *word++;
      while (--wordlen > 0);

      if ((*words & 1) == 0)
        break;
      *out++ = ' ';
    }
  *out = '\0';
  return buf;
}

/* Looks up the Unicode character with a given name, in upper- or lowercase
   ASCII.  Returns the character if found, or UNINAME_INVALID if not found.

   NAME is a NUL-terminated string.  It is rejected outright when its length
   is less than 2 or greater than UNICODE_CHARNAME_MAX_LENGTH, or when it
   contains a byte outside the range ' ' .. '~'.  Lowercase ASCII letters
   are folded to uppercase before the lookup.  The name is split into words
   at single spaces.  Hangul syllable names and CJK compatibility ideograph
   names are decoded algorithmically once their first two words have been
   recognized; all other names are searched in unicode_name_to_code.  */
unsigned int
unicode_name_character (const char *name)
{
  unsigned int len = strlen (name);
  if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
    {
      /* Test for "word1 word2 ..." syntax.  */
      /* buf receives the uppercased copy of name, without terminating NUL;
	 ptr marks its end.  The length check above guarantees it fits.  */
      char buf[UNICODE_CHARNAME_MAX_LENGTH];
      char *ptr = buf;
      for (;;)
	{
	  char c = *name++;
	  /* A character outside printable ASCII cannot be part of a name.
	     Leaving the loop here skips the block below, so that
	     UNINAME_INVALID is returned.  */
	  if (!(c >= ' ' && c <= '~'))
	    break;
	  *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
	  if (--len == 0)
	    goto filled_buf;
	}
      /* The following block is entered only through the label
	 filled_buf, never by falling into it.  */
      if (false)
      filled_buf:
	{
	  /* Convert the constituents to uint16_t words.  */
	  uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
	  uint16_t *wordptr = words;
	  {
	    /* p1 points to the start of the word being looked at.  */
	    const char *p1 = buf;
	    for (;;)
	      {
		{
		  int word;
		  /* Find the end p2 of the word that starts at p1.  */
		  const char *p2 = p1;
		  while (p2 < ptr && *p2 != ' ')
		    p2++;
		  word = unicode_name_word_lookup (p1, p2 - p1);
		  /* Each 'break' in this inner block leaves the enclosing
		     for loop; control then skips the filled_words block and
		     the function returns UNINAME_INVALID.  */
		  if (word < 0)
		    break;
		  /* More words than any known name has.  */
		  if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
		    break;
		  *wordptr++ = word;
		  /* Reached the end of the name: all words are known.  */
		  if (p2 == ptr)
		    goto filled_words;
		  p1 = p2 + 1;
		}
		/* From here on, p1 points to the rest of the name after the
		   words collected so far.  */
		/* Special case for Hangul syllables. Keeps the tables small. */
		if (wordptr == &words[2]
		    && words[0] == UNICODE_CHARNAME_WORD_HANGUL
		    && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
		  {
		    /* Split the last word [p1..ptr) into three parts:
			 1) [BCDGHJKMNPRST]
			 2) [AEIOUWY]
			 3) [BCDGHIJKLMNPST]
		       The parts are [p1..p2), [p2..p3) and [p3..p4); each is
		       the longest run of letters from its set.
		     */
		    const char *p2;
		    const char *p3;
		    const char *p4;

		    p2 = p1;
		    while (p2 < ptr
			   && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
			       || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
			       || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
			       || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
			       || *p2 == 'T'))
		      p2++;
		    p3 = p2;
		    while (p3 < ptr
			   && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
			       || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
			       || *p3 == 'Y'))
		      p3++;
		    p4 = p3;
		    while (p4 < ptr
			   && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
			       || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
			       || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
			       || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
			       || *p4 == 'S' || *p4 == 'T'))
		      p4++;
		    /* The three parts must cover the whole rest of the
		       name.  */
		    if (p4 == ptr)
		      {
			unsigned int n1 = p2 - p1;
			unsigned int n2 = p3 - p2;
			unsigned int n3 = p4 - p3;

			/* The length limits follow from the jamo tables and
			   keep the [n1], [n2], [n3] accesses below inside
			   the table rows.  The medial part is mandatory.  */
			if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
			  {
			    unsigned int index1;

			    /* A part matches a table entry if the entry has
			       the same characters and ends right after
			       them.  The entries of each table are distinct,
			       so at most one entry per table matches; hence
			       the 'break' once a match has been examined.  */
			    for (index1 = 0; index1 < 19; index1++)
			      if (memcmp(jamo_initial_short_name[index1], p1, n1) == 0
				  && jamo_initial_short_name[index1][n1] == '\0')
				{
				  unsigned int index2;

				  for (index2 = 0; index2 < 21; index2++)
				    if (memcmp(jamo_medial_short_name[index2], p2, n2) == 0
					&& jamo_medial_short_name[index2][n2] == '\0')
				      {
					unsigned int index3;

					for (index3 = 0; index3 < 28; index3++)
					  if (memcmp(jamo_final_short_name[index3], p3, n3) == 0
					      && jamo_final_short_name[index3][n3] == '\0')
					    {
					      /* Inverse of the decomposition
						 done in
						 unicode_character_name.  */
					      return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
					    }
					break;
				      }
				  break;
				}
			  }
		      }
		    /* No syllable matched.  Processing continues with the
		       checks below and then with the ordinary word lookup
		       of the next word.  */
		  }
		/* Special case for CJK compatibility ideographs. Keeps the
		   tables small.  The rest of the name must be "IDEOGRAPH-"
		   followed by 4 or 5 characters.  */
		if (wordptr == &words[2]
		    && words[0] == UNICODE_CHARNAME_WORD_CJK
		    && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
		    && p1 + 14 <= ptr
		    && p1 + 15 >= ptr
		    && memcmp (p1, "IDEOGRAPH-", 10) == 0)
		  {
		    const char *p2 = p1 + 10;

		    /* A number with a leading zero is not accepted.  */
		    if (*p2 != '0')
		      {
			unsigned int c = 0;

			/* Accumulate the uppercase hexadecimal digits up to
			   the end of the name.  Any other character, or a
			   value outside the ranges of CJK compatibility
			   ideographs, ends the attempt.  */
			for (;;)
			  {
			    if (*p2 >= '0' && *p2 <= '9')
			      c += (*p2 - '0');
			    else if (*p2 >= 'A' && *p2 <= 'F')
			      c += (*p2 - 'A' + 10);
			    else
			      break;
			    p2++;
			    if (p2 == ptr)
			      {
				if ((c >= 0xF900 && c <= 0xFA2D)
				    || (c >= 0xFA30 && c <= 0xFA6A)
				    || (c >= 0x2F800 && c <= 0x2FA1D))
				  return c;
				else
				  break;
			      }
			    c = c << 4;
			  }
		      }
		  }
	      }
	  }
	  /* The following block is entered only through the label
	     filled_words, i.e. when every word of the name was found.  */
	  if (false)
	  filled_words:
	    {
	      /* Multiply by 2, to simplify later comparisons.
		 Afterwards the words have the form in which
		 unicode_character_name reads unicode_names: the word index
		 shifted left by one, with the lowest bit set on every word
		 except the last one.  */
	      unsigned int words_length = wordptr - words;
	      {
		int i = words_length - 1;
		words[i] = 2 * words[i];
		for (; --i >= 0; )
		  words[i] = 2 * words[i] + 1;
	      }
	      /* Binary search in unicode_name_to_code.  */
	      {
		unsigned int i1 = 0;
		unsigned int i2 = SIZEOF (unicode_name_to_code);
		for (;;)
		  {
		    unsigned int i = (i1 + i2) >> 1;
		    const uint16_t *w = words;
		    const uint16_t *p = &unicode_names[unicode_name_to_code[i].name];
		    unsigned int n = words_length;
		    /* Compare word by word.  Since the lowest bit tells
		       whether a word is the last one, a name with a
		       different number of words differs at or before the
		       last word of the shorter name.  */
		    for (;;)
		      {
			if (*p < *w)
			  {
			    /* The interval cannot shrink any further.  */
			    if (i1 == i)
			      goto name_not_found;
			    /* Note here: i1 < i < i2.  */
			    i1 = i;
			    break;
			  }
			else if (*p > *w)
			  {
			    if (i2 == i)
			      goto name_not_found;
			    /* Note here: i1 <= i < i2.  */
			    i2 = i;
			    break;
			  }
			p++; w++; n--;
			if (n == 0)
			  {
			    /* All words are equal: entry i is the name.  */
			    unsigned int c = unicode_name_to_code[i].code;

			    /* Undo the transformation to 16-bit space.
			       The table is indexed by the upper 4 bits of
			       the 16-bit code; its values are the amounts
			       that the switch statement in
			       unicode_character_name subtracts.  */
			    static const unsigned int offset[10] =
			      {
				0x00000, 0x00000, 0x00000, 0x00000, 0x06000,
				0x0A000, 0x0A000, 0x16000, 0x27000, 0xD7000
			      };
			    return c + offset[c >> 12];
			  }
		      }
		  }
	      }
	    name_not_found: ;
	    }
	}
    }
  return UNINAME_INVALID;
}