// UtfOps.java


/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2000-2003
 *      Sleepycat Software.  All rights reserved.
 *
 * $Id: UtfOps.java,v 1.2 2004/03/30 01:23:36 jtownsen Exp $
 */

package com.sleepycat.bdb.util;

import java.io.EOFException;
import java.io.UTFDataFormatException;

/**
 * UTF operations with more flexibility than is provided by DataInput and
 * DataOutput.
 *
 * <p>Every Java <code>char</code> is encoded on its own in one, two or three
 * bytes.  The character U+0000 is written as the two bytes 0xC0 0x80, so
 * encoded data never contains a zero byte; this is what allows a zero byte
 * to be used as a string terminator (see
 * {@link #getZeroTerminatedByteLength}).</p>
 *
 * @author Mark Hayes
 */
public class UtfOps {

    private static final byte[] EMPTY_BYTES = {};
    private static final String EMPTY_STRING = "";

    /** Message used when a multi-byte sequence is cut off by the range. */
    private static final String INCOMPLETE_SEQUENCE =
        "Incomplete UTF sequence at end of data";

    /**
     * Returns the byte length of a null terminated UTF string, not including
     * the terminator.
     *
     * @param bytes the data containing the UTF string.
     *
     * @param offset the beginning of the string to measure.
     *
     * @return the number of bytes.
     *
     * @throws EOFException if no zero byte is found between offset and the
     * end of the array, or if offset is outside the array.
     */
    public static int getZeroTerminatedByteLength(byte[] bytes, int offset)
        throws EOFException {

        try {
            int len = 0;
            while (bytes[offset++] != 0) {
                len++;
            }
            return len;
        } catch (IndexOutOfBoundsException e) {
            throw eofException(e);
        }
    }

    /**
     * Returns the byte length of the UTF string that would be created by
     * converting the given characters to UTF.
     *
     * @param chars the characters that would be converted.
     *
     * @return the byte length of the equivalent UTF data.
     */
    public static int getByteLength(char[] chars) {

        return getByteLength(chars, 0, chars.length);
    }

    /**
     * Returns the byte length of the UTF string that would be created by
     * converting the given characters to UTF.
     *
     * @param chars the characters that would be converted.
     *
     * @param offset the first character to be converted.
     *
     * @param length the number of characters to be converted.
     *
     * @return the byte length of the equivalent UTF data.
     */
    public static int getByteLength(char[] chars, int offset, int length) {

        int len = 0;
        final int end = offset + length;
        for (int i = offset; i < end; i++) {
            int c = chars[i];
            if ((c >= 0x0001) && (c <= 0x007F)) {
                len++;
            } else if (c > 0x07FF) {
                len += 3;
            } else {
                // U+0000 and U+0080..U+07FF both take two bytes.
                len += 2;
            }
        }
        return len;
    }

    /**
     * Returns the number of characters represented by the given UTF string.
     *
     * @param bytes the UTF string.
     *
     * @return the number of characters.
     *
     * @throws EOFException if a UTF character sequence at the end of the data
     * is not complete.
     *
     * @throws UTFDataFormatException if an illegal UTF sequence is
     * encountered.
     */
    public static int getCharLength(byte[] bytes)
        throws UTFDataFormatException, EOFException {

        return getCharLength(bytes, 0, bytes.length);
    }

    /**
     * Returns the number of characters represented by the given UTF string.
     *
     * <p>Only the first byte of each sequence is inspected; continuation
     * bytes are skipped without being validated.</p>
     *
     * @param bytes the data containing the UTF string.
     *
     * @param offset the first byte to be converted.
     *
     * @param length the number of bytes to be converted.
     *
     * @return the number of characters.
     *
     * @throws EOFException if a UTF character sequence at the end of the data
     * is not complete, or if the given range extends outside the array.
     *
     * @throws UTFDataFormatException if an illegal UTF sequence is
     * encountered.
     */
    public static int getCharLength(byte[] bytes, int offset, int length)
        throws UTFDataFormatException, EOFException {

        try {
            int charCount = 0;
            final int end = offset + length;
            while (offset < end) {
                final int seqLen;
                switch ((bytes[offset] & 0xff) >> 4) {
                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                    seqLen = 1;
                    break;
                case 12: case 13:
                    seqLen = 2;
                    break;
                case 14:
                    seqLen = 3;
                    break;
                default:
                    throw new UTFDataFormatException(
                        "Illegal UTF lead byte at offset " + offset);
                }
                // A sequence that would run past the end of the range is
                // incomplete; do not silently count it as a character.
                if (seqLen > end - offset) {
                    throw new EOFException(INCOMPLETE_SEQUENCE);
                }
                offset += seqLen;
                charCount++;
            }
            return charCount;
        } catch (IndexOutOfBoundsException e) {
            throw eofException(e);
        }
    }

    /**
     * Converts byte arrays into character arrays.
     *
     * @param bytes the source byte data to convert
     *
     * @param byteOffset the offset into the byte array at which
     * to start the conversion
     *
     * @param chars the destination array
     *
     * @param charOffset the offset into chars at which to begin the copy
     *
     * @param len the amount of information to copy into chars
     *
     * @param isByteLen if true then len is a measure of bytes, otherwise
     * len is a measure of characters
     *
     * @return the offset into bytes of the first byte following the data
     * that was converted.
     *
     * @throws EOFException if a UTF character sequence at the end of the data
     * is not complete, or if an index falls outside either array.
     *
     * @throws UTFDataFormatException if an illegal UTF sequence is
     * encountered.
     */
    public static int bytesToChars(byte[] bytes, int byteOffset,
                                   char[] chars, int charOffset,
                                   int len, boolean isByteLen)
        throws UTFDataFormatException, EOFException {

        try {
            int char1, char2, char3;
            // The loop limit is a byte offset or a char offset depending on
            // the unit in which len was given.
            final int end = len + (isByteLen ? byteOffset : charOffset);
            while ((isByteLen ? byteOffset : charOffset) < end) {
                char1 = bytes[byteOffset++] & 0xff;
                switch (char1 >> 4) {
                case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                    chars[charOffset++] = (char) char1;
                    break;
                case 12: case 13:
                    // One continuation byte must lie inside the byte range.
                    if (isByteLen && (end - byteOffset < 1)) {
                        throw new EOFException(INCOMPLETE_SEQUENCE);
                    }
                    char2 = bytes[byteOffset++];
                    if ((char2 & 0xC0) != 0x80) {
                        throw new UTFDataFormatException(
                            "Illegal UTF continuation byte before offset " +
                            byteOffset);
                    }
                    chars[charOffset++] = (char)(((char1 & 0x1F) << 6) |
                                                  (char2 & 0x3F));
                    break;
                case 14:
                    // Two continuation bytes must lie inside the byte range.
                    if (isByteLen && (end - byteOffset < 2)) {
                        throw new EOFException(INCOMPLETE_SEQUENCE);
                    }
                    char2 = bytes[byteOffset++];
                    char3 = bytes[byteOffset++];
                    if (((char2 & 0xC0) != 0x80) ||
                        ((char3 & 0xC0) != 0x80)) {
                        throw new UTFDataFormatException(
                            "Illegal UTF continuation byte before offset " +
                            byteOffset);
                    }
                    chars[charOffset++] = (char)(((char1 & 0x0F) << 12) |
                                                 ((char2 & 0x3F) << 6)  |
                                                 ((char3 & 0x3F) << 0));
                    break;
                default:
                    throw new UTFDataFormatException(
                        "Illegal UTF lead byte before offset " + byteOffset);
                }
            }
            return byteOffset;
        } catch (IndexOutOfBoundsException e) {
            throw eofException(e);
        }
    }

    /**
     * Converts character arrays into byte arrays.
     *
     * <p>The destination must have room for
     * <code>getByteLength(chars, charOffset, charLength)</code> bytes
     * starting at byteOffset.</p>
     *
     * @param chars the source character data to convert
     *
     * @param charOffset the offset into the character array at which
     * to start the conversion
     *
     * @param bytes the destination array
     *
     * @param byteOffset the offset into bytes at which to begin the copy
     *
     * @param charLength the length of characters to copy into bytes
     */
    public static void charsToBytes(char[] chars, int charOffset,
                                    byte[] bytes, int byteOffset,
                                    int charLength) {
        final int end = charOffset + charLength;
        for (int i = charOffset; i < end; i++) {
            int c = chars[i];
            if ((c >= 0x0001) && (c <= 0x007F)) {
                bytes[byteOffset++] = (byte) c;
            } else if (c > 0x07FF) {
                bytes[byteOffset++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
                bytes[byteOffset++] = (byte) (0x80 | ((c >>  6) & 0x3F));
                bytes[byteOffset++] = (byte) (0x80 | ((c >>  0) & 0x3F));
            } else {
                // U+0000 lands here and becomes 0xC0 0x80, never a zero byte.
                bytes[byteOffset++] = (byte) (0xC0 | ((c >>  6) & 0x1F));
                bytes[byteOffset++] = (byte) (0x80 | ((c >>  0) & 0x3F));
            }
        }
    }

    /**
     * Converts byte arrays into strings.
     *
     * @param bytes the source byte data to convert
     *
     * @param offset the offset into the byte array at which
     * to start the conversion
     *
     * @param length the number of bytes to be converted.
     *
     * @return the string.
     *
     * @throws EOFException if a UTF character sequence at the end of the data
     * is not complete.
     *
     * @throws UTFDataFormatException if an illegal UTF sequence is
     * encountered.
     */
    public static String bytesToString(byte[] bytes, int offset, int length)
        throws UTFDataFormatException, EOFException {

        if (length == 0) return EMPTY_STRING;
        // First pass sizes the char array, second pass fills it.
        int charLen = UtfOps.getCharLength(bytes, offset, length);
        char[] chars = new char[charLen];
        UtfOps.bytesToChars(bytes, offset, chars, 0, length, true);
        return new String(chars, 0, charLen);
    }

    /**
     * Converts strings to byte arrays.
     *
     * @param string the string to convert.
     *
     * @return the UTF byte array.
     */
    public static byte[] stringToBytes(String string) {

        if (string.length() == 0) return EMPTY_BYTES;
        char[] chars = string.toCharArray();
        byte[] bytes = new byte[UtfOps.getByteLength(chars)];
        UtfOps.charsToBytes(chars, 0, bytes, 0, chars.length);
        return bytes;
    }

    /**
     * Builds the EOFException reported for an out-of-range array access,
     * keeping the original exception as its cause.  EOFException has no
     * constructor that takes a cause, hence initCause.
     */
    private static EOFException eofException(IndexOutOfBoundsException e) {

        EOFException eof = new EOFException(e.toString());
        eof.initCause(e);
        return eof;
    }
}