/* gdkanji.c */



/* gdkanji.c (Kanji code converter)                            */
/*                 written by Masahito Yamaga (ma@yama-ga.com) */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "gd.h"
#include "gdhelpers.h"

#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif

#include <stdarg.h>
#if defined(HAVE_ICONV_H)
#include <iconv.h>
#endif

#ifndef HAVE_ICONV_T_DEF
/* The platform did not supply iconv_t; an opaque pointer is enough here. */
typedef void *iconv_t;
#endif

#ifndef HAVE_ICONV
/*
 * Fallback iconv entry points for builds without iconv.  They only keep
 * the symbols available; no conversion is performed by any of them.
 */
#define ICONV_CONST /**/

iconv_t iconv_open (const char *, const char *);
size_t iconv (iconv_t, ICONV_CONST char **, size_t *, char **, size_t *);
int iconv_close (iconv_t);

/* Always reports failure by returning (iconv_t) -1. */
iconv_t
iconv_open (const char *tocode, const char *fromcode)
{
  (void) tocode;
  (void) fromcode;

  return (iconv_t) (-1);
}

/* Converts nothing, touches none of its arguments and returns 0. */
size_t
iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t * inbytesleft,
       char **outbuf, size_t * outbytesleft)
{
  (void) cd;
  (void) inbuf;
  (void) inbytesleft;
  (void) outbuf;
  (void) outbytesleft;

  return 0;
}

/* Nothing was ever opened, so there is nothing to release; returns 0. */
int
iconv_close (iconv_t cd)
{
  (void) cd;

  return 0;
}

#endif /* !HAVE_ICONV */

#define LIBNAME "any2eucjp()"

#if defined(__MSC__) || defined(__BORLANDC__) || defined(__TURBOC__) || defined(_Windows) || defined(MSDOS)
#ifndef SJISPRE
#define SJISPRE 1
#endif
#endif

#ifdef TRUE
#undef TRUE
#endif
#ifdef FALSE
#undef FALSE
#endif

#define TRUE  1
#define FALSE 0

#define NEW 1
#define OLD 2
#define ESCI 3
#define NEC 4
#define EUC 5
#define SJIS 6
#define EUCORSJIS 7
#define ASCII 8

#define NEWJISSTR "JIS7"
#define OLDJISSTR "jis"
#define EUCSTR    "eucJP"
#define SJISSTR   "SJIS"

#define ESC 27
#define SS2 142

/*
 * Print a printf-style trace line on stdout, prefixed with LIBNAME and
 * terminated by a newline.  Compiles to an empty function unless DEBUG
 * is defined.
 */
static void
debug (const char *format, ...)
{
#ifndef DEBUG
  (void) format;
#else
  va_list ap;

  va_start (ap, format);
  printf ("%s: ", LIBNAME);
  vprintf (format, ap);
  putchar ('\n');
  va_end (ap);
#endif
}

/*
 * Print a printf-style diagnostic on stderr as "<LIBNAME>: <message>\n".
 */
static void
error (const char *format, ...)
{
  va_list ap;

  fputs (LIBNAME, stderr);
  fputs (": ", stderr);

  va_start (ap, format);
  vfprintf (stderr, format, ap);
  va_end (ap);

  fputc ('\n', stderr);
}

/* DetectKanjiCode() derived from DetectCodeType() by Ken Lunde. */

/*
 * Guess the Japanese encoding of the NUL-terminated byte string `str'.
 *
 * The string is scanned until either a byte pattern settles the encoding
 * or the terminator is reached.  Returns one of NEW, OLD, ESCI, NEC, EUC,
 * SJIS or ASCII; EUCORSJIS is only used internally and is always resolved
 * before returning.
 *
 * The verdict is kept in a function-static variable, so a call's result
 * can depend on earlier calls: when the current string is ambiguous
 * (EUCORSJIS) and the previous call ended with a definite encoding, that
 * earlier encoding is reused.  Otherwise an ambiguous result is resolved
 * from the LC_ALL / LC_CTYPE / LANG environment variables and, failing
 * that, from the SJISPRE compile-time preference.
 */
static int
DetectKanjiCode (unsigned char *str)
{
  static int whatcode = ASCII;	/* survives across calls */
  int oldcode = ASCII;
  int c, i;
  char *lang = NULL;

  c = '\1';			/* any non-NUL value, so the scan loop is entered */
  i = 0;

  /* Remember a definite verdict from the previous call, then start over. */
  if (whatcode != EUCORSJIS && whatcode != ASCII)
    {
      oldcode = whatcode;
      whatcode = ASCII;
    }

  /* Scan while undecided; every branch that reads a NUL ends the loop. */
  while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != '\0')
    {
      if ((c = str[i++]) != '\0')
	{
	  if (c == ESC)
	    {
	      /* Escape sequences: ESC $ B, ESC $ @, ESC ( I, ESC K. */
	      c = str[i++];
	      if (c == '$')
		{
		  c = str[i++];
		  if (c == 'B')
		    whatcode = NEW;
		  else if (c == '@')
		    whatcode = OLD;
		}
	      else if (c == '(')
		{
		  c = str[i++];
		  if (c == 'I')
		    whatcode = ESCI;
		}
	      else if (c == 'K')
		whatcode = NEC;
	    }
	  /* 129..141 and 143..159 are treated as SJIS right away. */
	  else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
	    whatcode = SJIS;
	  else if (c == SS2)
	    {
	      /* 142 (SS2): the following byte decides. */
	      c = str[i++];
	      if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160)
		  || (c >= 224 && c <= 252))
		whatcode = SJIS;
	      else if (c >= 161 && c <= 223)
		whatcode = EUCORSJIS;
	    }
	  else if (c >= 161 && c <= 223)
	    {
	      /* 161..223 is ambiguous; look at the following byte(s). */
	      c = str[i++];
	      if (c >= 240 && c <= 254)
		whatcode = EUC;
	      else if (c >= 161 && c <= 223)
		whatcode = EUCORSJIS;
	      else if (c >= 224 && c <= 239)
		{
		  /* Keep reading while bytes stay >= 64, hoping for one
		     that is only accepted as SJIS or only as EUC. */
		  whatcode = EUCORSJIS;
		  while (c >= 64 && c != '\0' && whatcode == EUCORSJIS)
		    {
		      if (c >= 129)
			{
			  if (c <= 141 || (c >= 143 && c <= 159))
			    whatcode = SJIS;
			  else if (c >= 253 && c <= 254)
			    whatcode = EUC;
			}
		      c = str[i++];
		    }
		}
	      else if (c <= 159)
		/* Also taken when the following byte is the terminator. */
		whatcode = SJIS;
	    }
	  else if (c >= 240 && c <= 254)
	    whatcode = EUC;
	  else if (c >= 224 && c <= 239)
	    {
	      c = str[i++];
	      if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
		whatcode = SJIS;
	      else if (c >= 253 && c <= 254)
		whatcode = EUC;
	      else if (c >= 161 && c <= 252)
		whatcode = EUCORSJIS;
	    }
	}
    }

#ifdef DEBUG
  if (whatcode == ASCII)
    debug ("Kanji code not included.");
  else if (whatcode == EUCORSJIS)
    debug ("Kanji code not detected.");
  else
    debug ("Kanji code detected at %d byte.", i);
#endif

  /* Still ambiguous: fall back to the previous call's definite verdict. */
  if (whatcode == EUCORSJIS && oldcode != ASCII)
    whatcode = oldcode;

  /* Still ambiguous: consult the locale environment variables. */
  if (whatcode == EUCORSJIS)
    {
      if (getenv ("LC_ALL"))
	lang = getenv ("LC_ALL");
      else if (getenv ("LC_CTYPE"))
	lang = getenv ("LC_CTYPE");
      else if (getenv ("LANG"))
	lang = getenv ("LANG");

      if (lang)
	{
	  if (strcmp (lang, "ja_JP.SJIS") == 0 ||
#ifdef hpux
	      strcmp (lang, "japanese") == 0 ||
#endif
	      strcmp (lang, "ja_JP.mscode") == 0 ||
	      strcmp (lang, "ja_JP.PCK") == 0)
	    whatcode = SJIS;
	  else if (strncmp (lang, "ja", 2) == 0)
#ifdef SJISPRE
	    whatcode = SJIS;
#else
	    whatcode = EUC;
#endif
	}
    }

  /* Last resort: the compile-time preference. */
  if (whatcode == EUCORSJIS)
#ifdef SJISPRE
    whatcode = SJIS;
#else
    whatcode = EUC;
#endif

  return whatcode;
}

/* SJIStoJIS() is sjis2jis() by Ken Lunde. */

/*
 * Convert one double-byte Shift_JIS code to the corresponding 7-bit JIS
 * code, in place.  *p1 holds the first byte and *p2 the second byte on
 * entry; both are overwritten with the JIS bytes.
 */
static void
SJIStoJIS (int *p1, int *p2)
{
  const unsigned char lead = (unsigned char) *p1;
  const unsigned char trail = (unsigned char) *p2;
  int first_of_pair;
  int row_base;
  int cell_shift;

  /* Each Shift_JIS lead byte covers two JIS rows; second bytes below 159
     belong to the first row of the pair. */
  first_of_pair = (trail < 159) ? 1 : 0;

  if (lead < 160)
    row_base = 112;
  else
    row_base = 176;

  if (!first_of_pair)
    cell_shift = 126;
  else if (trail > 127)
    cell_shift = 32;
  else
    cell_shift = 31;

  *p1 = ((lead - row_base) << 1) - first_of_pair;
  *p2 -= cell_shift;
}

/* han2zen() was derived from han2zen() written by Ken Lunde. */

/* Hankaku kana that can take a daku-ten (222) / han-daku-ten (223) mark. */
#define IS_DAKU(c) (((c) >= 182 && (c) <= 196) || ((c) >= 202 && (c) <= 206) || ((c) == 179))
#define IS_HANDAKU(c) ((c) >= 202 && (c) <= 206)

/*
 * Convert one half-width (Hankaku) katakana code to the Shift_JIS code of
 * the matching full-width (Zenkaku) character, in place.
 *
 *   *p1  in:  Hankaku code, 161..223.
 *        out: first byte of the Shift_JIS code.
 *   *p2  in:  222 (daku-ten) or 223 (han-daku-ten) when that mark follows
 *             the kana and should be merged into it; any other value
 *             (callers pass 0) means "no mark".
 *        out: second byte of the Shift_JIS code.
 *
 * A mark is only merged when the kana can carry it; otherwise it is
 * ignored.  When *p1 is outside 161..223 both values are left untouched.
 */
static void
han2zen (int *p1, int *p2)
{
  /* Shift_JIS code for each Hankaku code, indexed by (code - 161). */
  static const unsigned char mtable[][2] = {
    {129, 66},
    {129, 117},
    {129, 118},
    {129, 65},
    {129, 69},
    {131, 146},
    {131, 64},
    {131, 66},
    {131, 68},
    {131, 70},
    {131, 72},
    {131, 131},
    {131, 133},
    {131, 135},
    {131, 98},
    {129, 91},
    {131, 65},
    {131, 67},
    {131, 69},
    {131, 71},
    {131, 73},
    {131, 74},
    {131, 76},
    {131, 78},
    {131, 80},
    {131, 82},
    {131, 84},
    {131, 86},
    {131, 88},
    {131, 90},
    {131, 92},
    {131, 94},
    {131, 96},
    {131, 99},
    {131, 101},
    {131, 103},
    {131, 105},
    {131, 106},
    {131, 107},
    {131, 108},
    {131, 109},
    {131, 110},
    {131, 113},
    {131, 116},
    {131, 119},
    {131, 122},
    {131, 125},
    {131, 126},
    {131, 128},
    {131, 129},
    {131, 130},
    {131, 132},
    {131, 134},
    {131, 136},
    {131, 137},
    {131, 138},
    {131, 139},
    {131, 140},
    {131, 141},
    {131, 143},
    {131, 147},
    {129, 74},
    {129, 75}
  };
  const int entries = (int) (sizeof mtable / sizeof mtable[0]);
  const int c = *p1;
  int daku = 0;
  int handaku = 0;

  /* Never index the table with a byte that is not a Hankaku code. */
  if (c < 161 || c - 161 >= entries)
    return;

  if (*p2 == 222 && IS_DAKU (c))
    daku = 1;			/* Daku-ten */
  else if (*p2 == 223 && IS_HANDAKU (c))
    handaku = 1;		/* Han-daku-ten */

  *p1 = mtable[c - 161][0];
  *p2 = mtable[c - 161][1];

  if (daku)
    {
      /* The voiced form directly follows the plain one in these ranges. */
      if ((*p2 >= 74 && *p2 <= 103) || (*p2 >= 110 && *p2 <= 122))
	(*p2)++;
      /* U + daku-ten is the one exception: it maps to VU (131, 148). */
      else if (*p1 == 131 && *p2 == 69)
	*p2 = 148;
    }
  else if (handaku && *p2 >= 110 && *p2 <= 122)
    (*p2) += 2;
}

/* Recast strcpy to handle unsigned chars used below. */
#define ustrcpy(A,B) (strcpy((char*)(A),(const char*)(B)))

static void
do_convert (unsigned char *to, unsigned char *from, const char *code)
{
#ifdef HAVE_ICONV
  iconv_t cd;
  size_t from_len, to_len;

  if ((cd = iconv_open (EUCSTR, code)) == (iconv_t) - 1)
    {
      error ("iconv_open() error");
#ifdef HAVE_ERRNO_H
      if (errno == EINVAL)
	error ("invalid code specification: \"%s\" or \"%s\"", EUCSTR, code);
#endif
      strcpy ((char *) to, (const char *) from);
      return;
    }

  from_len = strlen ((const char *) from) + 1;
  to_len = BUFSIZ;

  if ((int) (iconv (cd, (char **) &from, &from_len, (char **) &to, &to_len))
      == -1)
    {
#ifdef HAVE_ERRNO_H
      if (errno == EINVAL)
	error ("invalid end of input string");
      else if (errno == EILSEQ)
	error ("invalid code in input string");
      else if (errno == E2BIG)
	error ("output buffer overflow at do_convert()");
      else
#endif
	error ("something happen");
      strcpy ((char *) to, (const char *) from);
      return;
    }

  if (iconv_close (cd) != 0)
    {
      error ("iconv_close() error");
    }
#else
  int p1, p2, i, j;
  int jisx0208 = FALSE;
  int hankaku = FALSE;

  j = 0;
  if (strcmp (code, NEWJISSTR) == 0 || strcmp (code, OLDJISSTR) == 0)
    {
      for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
	{
	  if (from[i] == ESC)
	    {
	      i++;
	      if (from[i] == '$')
		{
		  jisx0208 = TRUE;
		  hankaku = FALSE;
		  i++;
		}
	      else if (from[i] == '(')
		{
		  jisx0208 = FALSE;
		  i++;
		  if (from[i] == 'I')	/* Hankaku Kana */
		    hankaku = TRUE;
		  else
		    hankaku = FALSE;
		}
	    }
	  else
	    {
	      if (jisx0208)
		to[j++] = from[i] + 128;
	      else if (hankaku)
		{
		  to[j++] = SS2;
		  to[j++] = from[i] + 128;
		}
	      else
		to[j++] = from[i];
	    }
	}
    }
  else if (strcmp (code, SJISSTR) == 0)
    {
      for (i = 0; from[i] != '\0' && j < BUFSIZ; i++)
	{
	  p1 = from[i];
	  if (p1 < 127)
	    to[j++] = p1;
	  else if ((p1 >= 161) && (p1 <= 223))
	    {			/* Hankaku Kana */
	      to[j++] = SS2;
	      to[j++] = p1;
	    }
	  else
	    {
	      p2 = from[++i];
	      SJIStoJIS (&p1, &p2);
	      to[j++] = p1 + 128;
	      to[j++] = p2 + 128;
	    }
	}
    }
  else
    {
      error ("invalid code specification: \"%s\"", code);
      return;
    }

  if (j >= BUFSIZ)
    {
      error ("output buffer overflow at do_convert()");
      ustrcpy (to, from);
    }
  else
    to[j] = '\0';
#endif /* HAVE_ICONV */
}

/*
 * Detect the encoding of the NUL-terminated string `from', convert it to
 * EUC-JP and store the result in `to', replacing half-width (Hankaku)
 * kana by their full-width (Zenkaku) forms.
 *
 * `to' must have room for BUFSIZ bytes.  Returns TRUE when the input was
 * recognised as a convertible Japanese encoding, FALSE when it was copied
 * through unchanged (ASCII, NEC Kanji or unknown).
 *
 * A (han-)daku-ten mark is merged into the preceding kana only when that
 * kana can carry it; otherwise the mark is converted as a character of
 * its own.  An SS2 byte that is not followed by a Hankaku code (161..223)
 * is copied through as it is.
 */
static int
do_check_and_conv (unsigned char *to, unsigned char *from)
{
  static unsigned char tmp[BUFSIZ];
  int p1, p2, mark, i, j;
  int kanji = TRUE;
  int overflow = FALSE;

  switch (DetectKanjiCode (from))
    {
    case NEW:
      debug ("Kanji code is New JIS.");
      do_convert (tmp, from, NEWJISSTR);
      break;
    case OLD:
      debug ("Kanji code is Old JIS.");
      do_convert (tmp, from, OLDJISSTR);
      break;
    case ESCI:
      debug
	("This string includes Hankaku-Kana (jisx0201) escape sequence [ESC] + ( + I.");
      do_convert (tmp, from, NEWJISSTR);
      break;
    case NEC:
      debug ("Kanji code is NEC Kanji.");
      error ("cannot convert NEC Kanji.");
      ustrcpy (tmp, from);
      kanji = FALSE;
      break;
    case EUC:
      debug ("Kanji code is EUC.");
      ustrcpy (tmp, from);
      break;
    case SJIS:
      debug ("Kanji code is SJIS.");
      do_convert (tmp, from, SJISSTR);
      break;
    case EUCORSJIS:
      debug ("Kanji code is EUC or SJIS.");
      ustrcpy (tmp, from);
      kanji = FALSE;
      break;
    case ASCII:
      debug ("This is ASCII string.");
      ustrcpy (tmp, from);
      kanji = FALSE;
      break;
    default:
      debug ("This string includes unknown code.");
      ustrcpy (tmp, from);
      kanji = FALSE;
      break;
    }

  if (!kanji)
    {
      ustrcpy (to, tmp);
      return kanji;
    }

  /* Hankaku Kana ---> Zenkaku Kana */
  j = 0;
  for (i = 0; tmp[i] != '\0'; i++)
    {
      /* tmp[i] is not NUL here, so tmp[i + 1] is at worst the terminator. */
      if (tmp[i] == SS2 && tmp[i + 1] >= 161 && tmp[i + 1] <= 223)
	{
	  /* Leave room for two bytes and the terminating NUL. */
	  if (j + 2 >= BUFSIZ)
	    {
	      overflow = TRUE;
	      break;
	    }

	  p1 = tmp[++i];
	  p2 = 0;
	  /* tmp[i + 2] is only read when tmp[i + 1] is SS2, i.e. not NUL. */
	  if (tmp[i + 1] == SS2)
	    {
	      mark = tmp[i + 2];
	      /* Swallow the mark only if han2zen() will really merge it. */
	      if ((mark == 222 && IS_DAKU (p1))
		  || (mark == 223 && IS_HANDAKU (p1)))
		{
		  p2 = mark;
		  i += 2;
		}
	    }
	  han2zen (&p1, &p2);
	  SJIStoJIS (&p1, &p2);
	  to[j++] = p1 + 128;
	  to[j++] = p2 + 128;
	}
      else
	{
	  /* Leave room for this byte and the terminating NUL. */
	  if (j + 1 >= BUFSIZ)
	    {
	      overflow = TRUE;
	      break;
	    }
	  to[j++] = tmp[i];
	}
    }

  if (overflow)
    {
      error ("output buffer overflow at Hankaku --> Zenkaku");
      ustrcpy (to, tmp);
    }
  else
    to[j] = '\0';

  return kanji;
}

/*
 * Convert the NUL-terminated string `src' (JIS, Shift_JIS, EUC-JP or
 * ASCII, auto-detected) to EUC-JP and store it in `dest'.
 *
 *   dest      output buffer of at least `dest_max' bytes
 *   src       input string; must be shorter than BUFSIZ
 *   dest_max  size of `dest' in bytes; must not exceed BUFSIZ
 *
 * Returns the result of do_check_and_conv() (TRUE when a Japanese encoding
 * was recognised, FALSE when the text was passed through unchanged), or
 * -1 on error.  When the arguments are rejected `dest' is not written.
 * When the converted text does not fit in `dest', `dest' receives as much
 * of the unconverted `src' as fits (always NUL-terminated, nothing at all
 * if dest_max is 0) and -1 is returned.
 */
int
any2eucjp (unsigned char *dest, unsigned char *src, unsigned int dest_max)
{
  static unsigned char tmp_dest[BUFSIZ];
  size_t src_len;
  size_t n;
  int ret;

  src_len = strlen ((const char *) src);
  if (src_len >= (size_t) BUFSIZ)
    {
      error ("input string too large");
      return -1;
    }
  if (dest_max > BUFSIZ)
    {
      error
	("invalid maximum size of destination\nit should be less than %d.",
	 BUFSIZ);
      return -1;
    }
  ret = do_check_and_conv (tmp_dest, src);
  if (strlen ((const char *) tmp_dest) >= dest_max)
    {
      error ("output buffer overflow");
      /* Fall back to the raw input, but never write past dest_max. */
      if (dest_max > 0)
	{
	  n = (src_len < dest_max) ? src_len : (size_t) dest_max - 1;
	  memcpy (dest, src, n);
	  dest[n] = '\0';
	}
      return -1;
    }
  ustrcpy (dest, tmp_dest);
  return ret;
}

#if 0
/*
 * Disabled helper: length in bytes of `s' after conversion by any2eucjp().
 * Returns 0 when the scratch buffer cannot be allocated.
 */
unsigned int
strwidth (unsigned char *s)
{
  unsigned char *t;
  unsigned int i;

  t = (unsigned char *) gdMalloc (BUFSIZ);
  if (t == NULL)
    return 0;
  /* any2eucjp() leaves the buffer untouched when it rejects its input. */
  t[0] = '\0';
  any2eucjp (t, s, BUFSIZ);
  i = (unsigned int) strlen ((const char *) t);
  gdFree (t);
  return i;
}

#ifdef DEBUG
/*
 * Disabled test driver: reads one line from stdin, prints its size before
 * and after conversion, then prints the converted text.
 */
int
main (void)
{
  unsigned char input[BUFSIZ];
  unsigned char *output;
  unsigned char *str;
  int c, i = 0;

  /* Keep one byte for the terminator and stop at end of file too. */
  while (i < BUFSIZ - 1 && (c = fgetc (stdin)) != EOF && c != '\n')
    input[i++] = (unsigned char) c;
  input[i] = '\0';

  printf ("input : %lu bytes\n",
	  (unsigned long) strlen ((const char *) input));
  printf ("output: %u bytes\n", strwidth (input));

  output = (unsigned char *) gdMalloc (BUFSIZ);
  if (output == NULL)
    return 1;
  output[0] = '\0';
  any2eucjp (output, input, BUFSIZ);
  str = output;
  while (*str != '\0')
    putchar (*(str++));
  putchar ('\n');
  gdFree (output);

  return 0;
}
#endif
#endif