loop_unicode.h   [plain text]


/*
 * Copyright (C) 1999-2003, 2005-2006 Free Software Foundation, Inc.
 * This file is part of the GNU LIBICONV Library.
 *
 * The GNU LIBICONV Library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * The GNU LIBICONV Library is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
 * Fifth Floor, Boston, MA 02110-1301, USA.
 */

/* This file defines the conversion loop via Unicode as a pivot encoding. */

/* Attempt to transliterate wc. Return code as in xxx_wctomb. */
static int unicode_transliterate (conv_t cd, ucs4_t wc,
                                  unsigned char* outptr, size_t outleft)
{
  if (cd->oflags & HAVE_HANGUL_JAMO) {
    /* Decompose Hangul into Jamo. Use double-width Jamo (contained
       in all Korean encodings and ISO-2022-JP-2), not half-width Jamo
       (contained in Unicode only). */
    ucs4_t buf[3];
    int ret = johab_hangul_decompose(cd,buf,wc);
    if (ret != RET_ILUNI) {
      /* we know 1 <= ret <= 3 */
      state_t backup_state = cd->ostate;
      unsigned char* backup_outptr = outptr;
      size_t backup_outleft = outleft;
      int i, sub_outcount;
      for (i = 0; i < ret; i++) {
        if (outleft == 0) {
          sub_outcount = RET_TOOSMALL;
          goto johab_hangul_failed;
        }
        sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
        if (sub_outcount <= RET_ILUNI)
          goto johab_hangul_failed;
        if (!(sub_outcount <= outleft)) abort();
        outptr += sub_outcount; outleft -= sub_outcount;
      }
      return outptr-backup_outptr;
    johab_hangul_failed:
      cd->ostate = backup_state;
      outptr = backup_outptr;
      outleft = backup_outleft;
      if (sub_outcount != RET_ILUNI)
        return RET_TOOSMALL;
    }
  }
  {
    /* Try to use a variant, but postfix it with
       U+303E IDEOGRAPHIC VARIATION INDICATOR
       (cf. Ken Lunde's "CJKV information processing", p. 188). */
    int indx = -1;
    if (wc == 0x3006)
      indx = 0;
    else if (wc == 0x30f6)
      indx = 1;
    else if (wc >= 0x4e00 && wc < 0xa000)
      indx = cjk_variants_indx[wc-0x4e00];
    if (indx >= 0) {
      for (;; indx++) {
        ucs4_t buf[2];
        unsigned short variant = cjk_variants[indx];
        unsigned short last = variant & 0x8000;
        variant &= 0x7fff;
        variant += 0x3000;
        buf[0] = variant; buf[1] = 0x303e;
        {
          state_t backup_state = cd->ostate;
          unsigned char* backup_outptr = outptr;
          size_t backup_outleft = outleft;
          int i, sub_outcount;
          for (i = 0; i < 2; i++) {
            if (outleft == 0) {
              sub_outcount = RET_TOOSMALL;
              goto variant_failed;
            }
            sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
            if (sub_outcount <= RET_ILUNI)
              goto variant_failed;
            if (!(sub_outcount <= outleft)) abort();
            outptr += sub_outcount; outleft -= sub_outcount;
          }
          return outptr-backup_outptr;
        variant_failed:
          cd->ostate = backup_state;
          outptr = backup_outptr;
          outleft = backup_outleft;
          if (sub_outcount != RET_ILUNI)
            return RET_TOOSMALL;
        }
        if (last)
          break;
      }
    }
  }
  if (wc >= 0x2018 && wc <= 0x201a) {
    /* Special case for quotation marks 0x2018, 0x2019, 0x201a */
    ucs4_t substitute =
      (cd->oflags & HAVE_QUOTATION_MARKS
       ? (wc == 0x201a ? 0x2018 : wc)
       : (cd->oflags & HAVE_ACCENTS
          ? (wc==0x2019 ? 0x00b4 : 0x0060) /* use accents */
          : 0x0027 /* use apostrophe */
      )  );
    int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,substitute,outleft);
    if (outcount != RET_ILUNI)
      return outcount;
  }
  {
    /* Use the transliteration table. */
    int indx = translit_index(wc);
    if (indx >= 0) {
      const unsigned int * cp = &translit_data[indx];
      unsigned int num = *cp++;
      state_t backup_state = cd->ostate;
      unsigned char* backup_outptr = outptr;
      size_t backup_outleft = outleft;
      unsigned int i;
      int sub_outcount;
      for (i = 0; i < num; i++) {
        if (outleft == 0) {
          sub_outcount = RET_TOOSMALL;
          goto translit_failed;
        }
        sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,cp[i],outleft);
        if (sub_outcount == RET_ILUNI)
          /* Recursive transliteration. */
          sub_outcount = unicode_transliterate(cd,cp[i],outptr,outleft);
        if (sub_outcount <= RET_ILUNI)
          goto translit_failed;
        if (!(sub_outcount <= outleft)) abort();
        outptr += sub_outcount; outleft -= sub_outcount;
      }
      return outptr-backup_outptr;
    translit_failed:
      cd->ostate = backup_state;
      outptr = backup_outptr;
      outleft = backup_outleft;
      if (sub_outcount != RET_ILUNI)
        return RET_TOOSMALL;
    }
  }
  return RET_ILUNI;
}

#ifndef LIBICONV_PLUG

struct uc_to_mb_fallback_locals {
  unsigned char* l_outbuf;
  size_t l_outbytesleft;
  int l_errno;
};

static void uc_to_mb_write_replacement (const char *buf, size_t buflen,
                                        void* callback_arg)
{
  struct uc_to_mb_fallback_locals * plocals =
    (struct uc_to_mb_fallback_locals *) callback_arg;
  /* Do nothing if already encountered an error in a previous call. */
  if (plocals->l_errno == 0) {
    /* Attempt to copy the passed buffer to the output buffer. */
    if (plocals->l_outbytesleft < buflen)
      plocals->l_errno = E2BIG;
    else {
      memcpy(plocals->l_outbuf, buf, buflen);
      plocals->l_outbuf += buflen;
      plocals->l_outbytesleft -= buflen;
    }
  }
}

struct mb_to_uc_fallback_locals {
  conv_t l_cd;
  unsigned char* l_outbuf;
  size_t l_outbytesleft;
  int l_errno;
};

static void mb_to_uc_write_replacement (const unsigned int *buf, size_t buflen,
                                        void* callback_arg)
{
  struct mb_to_uc_fallback_locals * plocals =
    (struct mb_to_uc_fallback_locals *) callback_arg;
  /* Do nothing if already encountered an error in a previous call. */
  if (plocals->l_errno == 0) {
    /* Attempt to convert the passed buffer to the target encoding. */
    conv_t cd = plocals->l_cd;
    unsigned char* outptr = plocals->l_outbuf;
    size_t outleft = plocals->l_outbytesleft;
    for (; buflen > 0; buf++, buflen--) {
      ucs4_t wc = *buf;
      int outcount;
      if (outleft == 0) {
        plocals->l_errno = E2BIG;
        break;
      }
      outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
      if (outcount != RET_ILUNI)
        goto outcount_ok;
      /* Handle Unicode tag characters (range U+E0000..U+E007F). */
      if ((wc >> 7) == (0xe0000 >> 7))
        goto outcount_zero;
      /* Try transliteration. */
      if (cd->transliterate) {
        outcount = unicode_transliterate(cd,wc,outptr,outleft);
        if (outcount != RET_ILUNI)
          goto outcount_ok;
      }
      if (cd->discard_ilseq) {
        outcount = 0;
        goto outcount_ok;
      }
      #ifndef LIBICONV_PLUG
      else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
        struct uc_to_mb_fallback_locals locals;
        locals.l_outbuf = outptr;
        locals.l_outbytesleft = outleft;
        locals.l_errno = 0;
        cd->fallbacks.uc_to_mb_fallback(wc,
                                        uc_to_mb_write_replacement,
                                        &locals,
                                        cd->fallbacks.data);
        if (locals.l_errno != 0) {
          plocals->l_errno = locals.l_errno;
          break;
        }
        outptr = locals.l_outbuf;
        outleft = locals.l_outbytesleft;
        outcount = 0;
        goto outcount_ok;
      }
      #endif
      outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
      if (outcount != RET_ILUNI)
        goto outcount_ok;
      plocals->l_errno = EILSEQ;
      break;
    outcount_ok:
      if (outcount < 0) {
        plocals->l_errno = E2BIG;
        break;
      }
      #ifndef LIBICONV_PLUG
      if (cd->hooks.uc_hook)
        (*cd->hooks.uc_hook)(wc, cd->hooks.data);
      #endif
      if (!(outcount <= outleft)) abort();
      outptr += outcount; outleft -= outcount;
    outcount_zero: ;
    }
    plocals->l_outbuf = outptr;
    plocals->l_outbytesleft = outleft;
  }
}

#endif /* !LIBICONV_PLUG */

static size_t unicode_loop_convert (iconv_t icd,
                                    const char* * inbuf, size_t *inbytesleft,
                                    char* * outbuf, size_t *outbytesleft)
{
  conv_t cd = (conv_t) icd;
  size_t result = 0;
  const unsigned char* inptr = (const unsigned char*) *inbuf;
  size_t inleft = *inbytesleft;
  unsigned char* outptr = (unsigned char*) *outbuf;
  size_t outleft = *outbytesleft;
  while (inleft > 0) {
    state_t last_istate = cd->istate;
    ucs4_t wc;
    int incount;
    int outcount;
    incount = cd->ifuncs.xxx_mbtowc(cd,&wc,inptr,inleft);
    if (incount < 0) {
      if (incount == RET_ILSEQ) {
        /* Case 1: invalid input */
        if (cd->discard_ilseq) {
          switch (cd->iindex) {
            case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
            case ei_utf32: case ei_utf32be: case ei_utf32le:
            case ei_ucs4internal: case ei_ucs4swapped:
              incount = 4; break;
            case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
            case ei_utf16: case ei_utf16be: case ei_utf16le:
            case ei_ucs2internal: case ei_ucs2swapped:
              incount = 2; break;
            default:
              incount = 1; break;
          }
          goto outcount_zero;
        }
        #ifndef LIBICONV_PLUG
        else if (cd->fallbacks.mb_to_uc_fallback != NULL) {
          struct mb_to_uc_fallback_locals locals;
          switch (cd->iindex) {
            case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
            case ei_utf32: case ei_utf32be: case ei_utf32le:
            case ei_ucs4internal: case ei_ucs4swapped:
              incount = 4; break;
            case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
            case ei_utf16: case ei_utf16be: case ei_utf16le:
            case ei_ucs2internal: case ei_ucs2swapped:
              incount = 2; break;
            default:
              incount = 1; break;
          }
          locals.l_cd = cd;
          locals.l_outbuf = outptr;
          locals.l_outbytesleft = outleft;
          locals.l_errno = 0;
          cd->fallbacks.mb_to_uc_fallback((const char *)inptr, incount,
                                          mb_to_uc_write_replacement,
                                          &locals,
                                          cd->fallbacks.data);
          if (locals.l_errno != 0) {
            errno = locals.l_errno;
            result = -1;
            break;
          }
          outptr = locals.l_outbuf;
          outleft = locals.l_outbytesleft;
          result += 1;
          goto outcount_zero;
        }
        #endif
        errno = EILSEQ;
        result = -1;
        break;
      }
      if (incount == RET_TOOFEW(0)) {
        /* Case 2: not enough bytes available to detect anything */
        errno = EINVAL;
        result = -1;
        break;
      }
      /* Case 3: k bytes read, but only a shift sequence */
      incount = -2-incount;
    } else {
      /* Case 4: k bytes read, making up a wide character */
      if (outleft == 0) {
        cd->istate = last_istate;
        errno = E2BIG;
        result = -1;
        break;
      }
      outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
      if (outcount != RET_ILUNI)
        goto outcount_ok;
      /* Handle Unicode tag characters (range U+E0000..U+E007F). */
      if ((wc >> 7) == (0xe0000 >> 7))
        goto outcount_zero;
      /* Try transliteration. */
      result++;
      if (cd->transliterate) {
        outcount = unicode_transliterate(cd,wc,outptr,outleft);
        if (outcount != RET_ILUNI)
          goto outcount_ok;
      }
      if (cd->discard_ilseq) {
        outcount = 0;
        goto outcount_ok;
      }
      #ifndef LIBICONV_PLUG
      else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
        struct uc_to_mb_fallback_locals locals;
        locals.l_outbuf = outptr;
        locals.l_outbytesleft = outleft;
        locals.l_errno = 0;
        cd->fallbacks.uc_to_mb_fallback(wc,
                                        uc_to_mb_write_replacement,
                                        &locals,
                                        cd->fallbacks.data);
        if (locals.l_errno != 0) {
          cd->istate = last_istate;
          errno = locals.l_errno;
          return -1;
        }
        outptr = locals.l_outbuf;
        outleft = locals.l_outbytesleft;
        outcount = 0;
        goto outcount_ok;
      }
      #endif
      outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
      if (outcount != RET_ILUNI)
        goto outcount_ok;
      cd->istate = last_istate;
      errno = EILSEQ;
      result = -1;
      break;
    outcount_ok:
      if (outcount < 0) {
        cd->istate = last_istate;
        errno = E2BIG;
        result = -1;
        break;
      }
      #ifndef LIBICONV_PLUG
      if (cd->hooks.uc_hook)
        (*cd->hooks.uc_hook)(wc, cd->hooks.data);
      #endif
      if (!(outcount <= outleft)) abort();
      outptr += outcount; outleft -= outcount;
    }
  outcount_zero:
    if (!(incount <= inleft)) abort();
    inptr += incount; inleft -= incount;
  }
  *inbuf = (const char*) inptr;
  *inbytesleft = inleft;
  *outbuf = (char*) outptr;
  *outbytesleft = outleft;
  return result;
}

static size_t unicode_loop_reset (iconv_t icd,
                                  char* * outbuf, size_t *outbytesleft)
{
  conv_t cd = (conv_t) icd;
  if (outbuf == NULL || *outbuf == NULL) {
    /* Reset the states. */
    memset(&cd->istate,'\0',sizeof(state_t));
    memset(&cd->ostate,'\0',sizeof(state_t));
    return 0;
  } else {
    size_t result = 0;
    if (cd->ifuncs.xxx_flushwc) {
      state_t last_istate = cd->istate;
      ucs4_t wc;
      if (cd->ifuncs.xxx_flushwc(cd, &wc)) {
        unsigned char* outptr = (unsigned char*) *outbuf;
        size_t outleft = *outbytesleft;
        int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
        if (outcount != RET_ILUNI)
          goto outcount_ok;
        /* Handle Unicode tag characters (range U+E0000..U+E007F). */
        if ((wc >> 7) == (0xe0000 >> 7))
          goto outcount_zero;
        /* Try transliteration. */
        result++;
        if (cd->transliterate) {
          outcount = unicode_transliterate(cd,wc,outptr,outleft);
          if (outcount != RET_ILUNI)
            goto outcount_ok;
        }
        if (cd->discard_ilseq) {
          outcount = 0;
          goto outcount_ok;
        }
        #ifndef LIBICONV_PLUG
        else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
          struct uc_to_mb_fallback_locals locals;
          locals.l_outbuf = outptr;
          locals.l_outbytesleft = outleft;
          locals.l_errno = 0;
          cd->fallbacks.uc_to_mb_fallback(wc,
                                          uc_to_mb_write_replacement,
                                          &locals,
                                          cd->fallbacks.data);
          if (locals.l_errno != 0) {
            cd->istate = last_istate;
            errno = locals.l_errno;
            return -1;
          }
          outptr = locals.l_outbuf;
          outleft = locals.l_outbytesleft;
          outcount = 0;
          goto outcount_ok;
        }
        #endif
        outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
        if (outcount != RET_ILUNI)
          goto outcount_ok;
        cd->istate = last_istate;
        errno = EILSEQ;
        return -1;
      outcount_ok:
        if (outcount < 0) {
          cd->istate = last_istate;
          errno = E2BIG;
          return -1;
        }
        #ifndef LIBICONV_PLUG
        if (cd->hooks.uc_hook)
          (*cd->hooks.uc_hook)(wc, cd->hooks.data);
        #endif
        if (!(outcount <= outleft)) abort();
        outptr += outcount;
        outleft -= outcount;
      outcount_zero:
        *outbuf = (char*) outptr;
        *outbytesleft = outleft;
      }
    }
    if (cd->ofuncs.xxx_reset) {
      unsigned char* outptr = (unsigned char*) *outbuf;
      size_t outleft = *outbytesleft;
      int outcount = cd->ofuncs.xxx_reset(cd,outptr,outleft);
      if (outcount < 0) {
        errno = E2BIG;
        return -1;
      }
      if (!(outcount <= outleft)) abort();
      *outbuf = (char*) (outptr + outcount);
      *outbytesleft = outleft - outcount;
    }
    memset(&cd->istate,'\0',sizeof(state_t));
    memset(&cd->ostate,'\0',sizeof(state_t));
    return result;
  }
}