#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Corporation and others.  All Rights Reserved.
#--------------------------------------------------------------------

# Forward-direction global filter: only characters in this set are
# considered by the forward transform.
# note: a global filter is more efficient, but MUST include all source chars
:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ;

# Forward direction only: normalize to NFKC.  The empty parentheses
# mean there is no reverse counterpart at this position.
:: NFKC ();

# Hiragana-Katakana

# This is largely a one-to-one mapping, but it has a
# few kinks:

# 1. The Katakana va/vi/ve/vo (30F7-30FA) have no
#    Hiragana equivalents.  We use Hiragana wa/wi/we/wo
#    (308F-3092) with a voicing mark (3099), which is
#    semantically equivalent.  However, this is a non-
#    roundtripping transformation.

# 2. The Katakana small ka/ke (30F5,30F6) have no
#    Hiragana equivalents.  We convert them to normal
#    Hiragana ka/ke (304B,3051).  This is a one-way
#    information-losing transformation and precludes
#    round-tripping of 30F5 and 30F6.

# 3. The combining marks 3099-309C are in the Hiragana
#    block, but they apply to Katakana as well, so we
#    leave them untouched.

# 4. The Katakana prolonged sound mark 30FC doubles the
#    preceding vowel.  This is a one-way information-
#    losing transformation from Katakana to Hiragana.

# 5. The Katakana middle dot separates words in foreign
#    expressions; we leave this unmodified.

# The above points preclude successful round-trip
# transformations of arbitrary input text.  However,
# they provide naturalistic results that should conform
# to user expectations.
# Combining equivalents va/vi/ve/vo
# Hiragana wa/wi/we/wo followed by the combining voicing mark (3099)
# pair with the precomposed Katakana va/vi/ve/vo.  These rules come
# before the plain wa/wi/we/wo mappings below so the two-character
# form is tried first.

わ゙ <> ヷ;
ゐ゙ <> ヸ;
ゑ゙ <> ヹ;
を゙ <> ヺ;

# One-to-one mappings, main block
# 3041:3094 <> 30A1:30F4
# 309D,E <> 30FD,E

# Vowels (small and normal forms)
ぁ <> ァ; あ <> ア; ぃ <> ィ; い <> イ; ぅ <> ゥ;
う <> ウ; ぇ <> ェ; え <> エ; ぉ <> ォ; お <> オ;

# ka/ga row
か <> カ; が <> ガ; き <> キ; ぎ <> ギ; く <> ク;
ぐ <> グ; け <> ケ; げ <> ゲ; こ <> コ; ご <> ゴ;

# sa/za row
さ <> サ; ざ <> ザ; し <> シ; じ <> ジ; す <> ス;
ず <> ズ; せ <> セ; ぜ <> ゼ; そ <> ソ; ぞ <> ゾ;

# ta/da row (including small tsu)
た <> タ; だ <> ダ; ち <> チ; ぢ <> ヂ; っ <> ッ; つ <> ツ;
づ <> ヅ; て <> テ; で <> デ; と <> ト; ど <> ド;

# na row
な <> ナ; に <> ニ; ぬ <> ヌ; ね <> ネ; の <> ノ;

# ha/ba/pa row
は <> ハ; ば <> バ; ぱ <> パ; ひ <> ヒ; び <> ビ;
ぴ <> ピ; ふ <> フ; ぶ <> ブ; ぷ <> プ; へ <> ヘ;
べ <> ベ; ぺ <> ペ; ほ <> ホ; ぼ <> ボ; ぽ <> ポ;

# ma row
ま <> マ; み <> ミ; む <> ム; め <> メ; も <> モ;

# ya row (small and normal forms)
ゃ <> ャ; や <> ヤ; ゅ <> ュ; ゆ <> ユ; ょ <> ョ; よ <> ヨ;

# ra row
ら <> ラ; り <> リ; る <> ル; れ <> レ; ろ <> ロ;

# wa row (including small wa), n, vu
ゎ <> ヮ; わ <> ワ; ゐ <> ヰ; ゑ <> ヱ; を <> ヲ;
ん <> ン; ゔ <> ヴ;

# Iteration marks
ゝ <> ヽ; ゞ <> ヾ;

# One-way Katakana-Hiragana xform of small K ka/ke to
# normal H ka/ke.

か < ヵ;
け < ヶ;

# Katakana followed by a prolonged sound mark 30FC has
# its final vowel doubled.  This is a Katakana-Hiragana
# one-way information-losing transformation.  We
# include the small Katakana (e.g., small A 30A1, which
# has already become Hiragana 3041 when these rules
# apply) and do not distinguish them from their large
# counterparts.  It doesn't make sense to double a
# small counterpart vowel as a small Hiragana vowel, so
# we don't do so.  In natural text this should never
# occur anyway.  If a 30FC is seen without a preceding
# vowel sound (e.g., after n 30F3) we do not change it.

### $long = ー;

# The following categories are Hiragana, not Katakana
# as might be expected, since by the time we get to the
# 30FC, the preceding character will have already been
# transformed to Hiragana.
# {The following mechanically generated from the
# Unicode 3.0 data:}

# Hiragana classes keyed by the vowel that a following prolonged
# sound mark (30FC) is replaced with.  Each set is written on a single
# line on purpose: a backslash that is not the very last character of
# a line escapes the next character, so a stray "\ " inside a set
# would add a literal space to it.

$xa = [ぁ あ か が さ ざ た だ な は ば ぱ ま ゃ や ら ゎ わ];
$xi = [ぃ い き ぎ し じ ち ぢ に ひ び ぴ み り ゐ];
$xu = [ぅ う く ぐ す ず っ つ づ ぬ ふ ぶ ぷ む ゅ ゆ る ゔ];
$xe = [ぇ え け げ せ ぜ て で ね へ べ ぺ め れ ゑ];
$xo = [ぉ お こ ご そ ぞ と ど の ほ ぼ ぽ も ょ よ ろ を];

# Katakana-to-Hiragana only: a prolonged sound mark whose preceding
# context is a member of a vowel class becomes that vowel.  The
# context itself (left of the braces) is not rewritten.

あ < $xa {ー};
い < $xi {ー};
う < $xu {ー};
え < $xe {ー};
お < $xo {ー};

# Reverse direction only: normalize to NFKC.
:: (NFKC) ;

# Reverse-direction global filter.
# note: a global filter is more efficient, but MUST include all source chars!!
:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]);

# eof