/** * Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc. * * Author: Oliver Hitz * * This file is part of GNU Libidn. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2.1 of * the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 * USA */ package gnu.inet.encoding; public class NFKC { /** * Applies NFKC normalization to a string. * * @param in The string to normalize. * @return An NFKC normalized string. */ public static String normalizeNFKC(String in) { StringBuffer out = new StringBuffer(); for (int i = 0; i < in.length(); i++) { char code = in.charAt(i); // In Unicode 3.0, Hangul was defined as the block from U+AC00 // to U+D7A3, however, since Unicode 3.2 the block extends until // U+D7AF. The decomposeHangul function only decomposes until // U+D7A3. Should this be changed? if (code >= 0xAC00 && code <= 0xD7AF) { out.append(decomposeHangul(code)); } else { int index = decomposeIndex(code); if (index == -1) { out.append(code); } else { out.append(DecompositionMappings.m[index]); } } } // Bring the stringbuffer into canonical order. canonicalOrdering(out); // Do the canonical composition. int last_cc = 0; int last_start = 0; for (int i = 0; i < out.length(); i++) { int cc = combiningClass(out.charAt(i)); if (i > 0 && (last_cc == 0 || last_cc != cc)) { // Try to combine characters char a = out.charAt(last_start); char b = out.charAt(i); int c = compose(a, b); if (c != -1) { out.setCharAt(last_start, (char) c); out.deleteCharAt(i); i--; if (i == last_start) { last_cc = 0; } else { last_cc = combiningClass(out.charAt(i-1)); } continue; } } if (cc == 0) { last_start = i; } last_cc = cc; } return out.toString(); } /** * Returns the index inside the decomposition table, implemented * using a binary search. * * @param c Character to look up. * @return Index if found, -1 otherwise. */ static int decomposeIndex(char c) { int start = 0; int end = DecompositionKeys.k.length/2; while (true) { int half = (start + end) / 2; int code = DecompositionKeys.k[half*2]; if (c == code) { return DecompositionKeys.k[half*2 + 1]; } if (half == start) { // Character not found return -1; } else if (c > code) { start = half; } else { end = half; } } } /** * Returns the combining class of a given character. * * @param c The character. * @return The combining class. */ static int combiningClass(char c) { int h = c >> 8; int l = c & 0xff; int i = CombiningClass.i[h]; if (i > -1) { return CombiningClass.c[i][l]; } else { return 0; } } /** * Rearranges characters in a stringbuffer in order to respect the * canonical ordering properties. * * @param The StringBuffer to rearrange. */ static void canonicalOrdering(StringBuffer in) { boolean isOrdered = false; while (!isOrdered) { isOrdered = true; int lastCC = combiningClass(in.charAt(0)); for (int i = 0; i < in.length()-1; i++) { int nextCC = combiningClass(in.charAt(i+1)); if (nextCC != 0 && lastCC > nextCC) { for (int j = i+1; j > 0; j--) { if (combiningClass(in.charAt(j-1)) <= nextCC) { break; } char t = in.charAt(j); in.setCharAt(j, in.charAt(j-1)); in.setCharAt(j-1, t); isOrdered = false; } nextCC = lastCC; } lastCC = nextCC; } } } /** * Returns the index inside the composition table. * * @param a Character to look up. * @return Index if found, -1 otherwise. */ static int composeIndex(char a) { if (a>>8 >= Composition.composePage.length) { return -1; } int ap = Composition.composePage[a>>8]; if (ap == -1) { return -1; } return Composition.composeData[ap][a & 0xff]; } /** * Tries to compose two characters canonically. * * @param a First character. * @param b Second character. * @return The composed character or -1 if no composition could be * found. */ static int compose(char a, char b) { int h = composeHangul(a, b); if (h != -1) { return h; } int ai = composeIndex(a); if (ai >= Composition.singleFirstStart && ai < Composition.singleSecondStart) { if (b == Composition.singleFirst[ai - Composition.singleFirstStart][0]) { return Composition.singleFirst[ai - Composition.singleFirstStart][1]; } else { return -1; } } int bi = composeIndex(b); if (bi >= Composition.singleSecondStart) { if (a == Composition.singleSecond[bi - Composition.singleSecondStart][0]) { return Composition.singleSecond[bi - Composition.singleSecondStart][1]; } else { return -1; } } if (ai >= 0 && ai < Composition.multiSecondStart && bi >= Composition.multiSecondStart && bi < Composition.singleFirstStart) { char[] f = Composition.multiFirst[ai]; if (bi - Composition.multiSecondStart < f.length) { char r = f[bi - Composition.multiSecondStart]; if (r == 0) { return -1; } else { return r; } } } return -1; } /** * Entire hangul code copied from: * http://www.unicode.org/unicode/reports/tr15/ * * Several hangul specific constants */ static final int SBase = 0xAC00; static final int LBase = 0x1100; static final int VBase = 0x1161; static final int TBase = 0x11A7; static final int LCount = 19; static final int VCount = 21; static final int TCount = 28; static final int NCount = VCount * TCount; static final int SCount = LCount * NCount; /** * Decomposes a hangul character. * * @param s A character to decompose. * @return A string containing the hangul decomposition of the input * character. If no hangul decomposition can be found, a string * containing the character itself is returned. */ static String decomposeHangul(char s) { int SIndex = s - SBase; if (SIndex < 0 || SIndex >= SCount) { return String.valueOf(s); } StringBuffer result = new StringBuffer(); int L = LBase + SIndex / NCount; int V = VBase + (SIndex % NCount) / TCount; int T = TBase + SIndex % TCount; result.append((char)L); result.append((char)V); if (T != TBase) result.append((char)T); return result.toString(); } /** * Composes two hangul characters. * * @param a First character. * @param b Second character. * @return Returns the composed character or -1 if the two * characters cannot be composed. */ static int composeHangul(char a, char b) { // 1. check to see if two current characters are L and V int LIndex = a - LBase; if (0 <= LIndex && LIndex < LCount) { int VIndex = b - VBase; if (0 <= VIndex && VIndex < VCount) { // make syllable of form LV return SBase + (LIndex * VCount + VIndex) * TCount; } } // 2. check to see if two current characters are LV and T int SIndex = a - SBase; if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0) { int TIndex = b - TBase; if (0 <= TIndex && TIndex <= TCount) { // make syllable of form LVT return a+TIndex; } } return -1; } }