#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------

# Latin <-> Katakana transliteration rules.
# Forward direction: Latin -> Katakana. Reverse direction: Katakana -> Latin.
#
# IMPORTANT: '#' starts a comment that runs to the end of the physical
# line, so every statement must stay on its own line to take effect.
# Rule order is significant: earlier rules win over later ones.
#
# Rule syntax reminder:
#   a > b     forward only
#   a < b     reverse only
#   a <> b    both directions
#   x } y     y is after-context (must follow x, is not consumed)
#   x { y     x is before-context (must precede y, is not consumed)
#   |         cursor position in the output; text after it is reprocessed
#   '...'     quoted literal; '' is a literal apostrophe
#   :: X (Y)  apply transform X going forward, Y going in reverse

# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;

# MINIMAL FILTER GENERATED FOR: Latin-Katakana

### WARNING -- must add width filter, both here and below!!! ###

:: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;

:: [:Latin:] fullwidth-halfwidth ();

:: NFD (NFC);
:: Lower (); # whenever transliterating from cased to uncased script, include this

# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese

# Uses modified Hepburn. Small changes to make unambiguous.
# | Kunrei-shiki: Hepburn/MHepburn
# | ------------------------------
# | si:           shi
# | si ~ya:       sha
# | si ~yu:       shu
# | si ~yo:       sho
# | zi:           ji
# | zi ~ya:       ja
# | zi ~yu:       ju
# | zi ~yo:       jo
# | ti:           chi
# | ti ~ya:       cha
# | ti ~yu:       chu
# | ti ~yo:       cho
# | tu:           tsu
# | di:           ji/dji
# | du:           zu/dzu
# | hu:           fu
# | For foreign words:
# | -----------------
# | se ~i         si
# | si ~e         she
# |
# | ze ~i         zi
# | zi ~e         je
# |
# | te ~i         ti
# | ti ~e         che
# | te ~u         tu
# |
# | de ~i         di
# | de ~u         du
# | de ~i         di
# |
# | he ~u         hu
# | hu ~a         fa
# | hu ~i         fi
# | hu ~e         fe
# | hu ~o         fo

# Most small forms are generated, but if necessary
# explicit small forms are given with ~a, ~ya, etc.

#------------------------------------------------------
# Variables

$vowel = [aeiou] ;
$consonant = [bcdfghjklmnpqrstvwxyz] ;
$macron = \u0304 ;

# Variables used for doubled-consonants with tsu

$kana = [\u3041-\u3094] ;
$voice = [\u3099\u309B];
$semivoice = [\u309A\u309C];

$k_start = [カキクケコかきくけこ] ;
$s_start = [サシスセソさしすせそ] ;
$j_start = [シし] $voice ;
$t_start = [タチツテトたちつてと] ;
$n_start = [ナニヌネノンなにぬねの] ;
$h_start = [ハヒヘホはひへほ] ;
$f_start = [フふ] ;
$m_start = [マミムメモまみむめも] ;
$y_start = [ヤユヨやゆよ] ;
$r_start = [ラリルレロらりるれろ] ;
$w_start = [ワヰヱヲわゐゑを] ;
$v_start = [ワヰヱヲ]゙ ;

# if ン is followed by $n_quoter, then it needs an
# apostrophe after its romaji form to disambiguate it.
# e.g., ン ア != ナ, so represent as "n'a", not "na".

$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;

$small_y = [ャィュェョ] ;
$iteration = \u309D ;

#------------------------------------------------------
# katakana rules

# Punctuation

'.' <> 。;
',' <> 、;

# ' ' } [a-z] > ; # delete spaces before latin
# ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before katakana

# Iteration Mark
# Copy previous letter & marks
# TODO

# | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration

# Specials for katakana -- not shared with hiragana

va <> ヷ ;
vi <> ヸ ;
ve <> ヹ ;
vo <> ヺ ;

'~ka' <> ヵ ;
'~ke' <> ヶ ;

# ~~~ begin shared rules ~~~

#special

ya < '~'ャ;
yi < '~'ィ ;
yu < '~'ュ;
ye < '~'ェ;
yo < '~'ョ;

#normal

a <> ア ;

b | '~' < ヒ ゙} $small_y ;
by } $vowel > ビ | '~y' ;
ba <> バ ;
bi <> ビ ;
bu <> ブ ;
be <> ベ ;
bo <> ボ ;

c } i > | s ;
c } e > | s ;

da <> ダ ;
di <> ディ ;
du <> デゥ ;
de <> デ ;
do <> ド ;

dzu <> ヅ ;
dja < ヂャ ;
dji'~i' < ヂィ ; # liu
dju < ヂュ ;
dje < ヂェ ;
djo < ヂョ ;
dji <> ヂ ;
dj } $vowel > ヂ | '~y' ;

# TODO: QUESTION: use ĵĴżŻ instead of dj, dz

cha < チャ ;
chi'~i' < チィ ; # liu
chu < チュ ;
che < チェ ;
cho < チョ ;
chi <> チ ;
ch } $vowel > チ | '~y' ;

e <> エ ;

g | '~' < ギ} $small_y ;
gy } $vowel > ギ | '~y' ;
ga <> ガ ;
gi <> ギ ;
gu <> グ ;
ge <> ゲ ;
go <> ゴ ;

i <> イ ;

# j } $vowel > ジ | '~y' ;
ja <> ジャ ;
ji'~i' < ジィ ; # liu
ju <> ジュ ;
je <> ジェ ;
jo <> ジョ ;
ji <> ジ ;

k | '~' < キ} $small_y ;
ky } $vowel > キ | '~y' ;
ka <> カ ;
ki <> キ ;
ku <> ク ;
ke <> ケ ;
ko <> コ ;

m | '~' < ミ} $small_y ;
my } $vowel > ミ | '~y' ;
ma <> マ ;
mi <> ミ ;
mu <> ム ;
me <> メ ;
mo <> モ ;

# m before a labial is written with the moraic nasal (forward only)
m } [pbfv] > ン ;

n | '~' < ニ } $small_y ;
ny } $vowel > ニ | '~y' ;
na <> ナ ;
ni <> ニ ;
nu <> ヌ ;
ne <> ネ ;
no <> ノ ;

o <> オ ;

p | '~' < ピ } $small_y ;
py } $vowel > ピ | '~y' ;
pa <> パ ;
pi <> ピ ;
pu <> プ ;
pe <> ペ ;
po <> ポ ;

h | '~' < ヒ } $small_y ;
hy } $vowel > ヒ | '~y' ;
ha <> ハ ;
hi <> ヒ ;
hu <> ヘゥ ;
he <> ヘ ;
ho <> ホ ;

# f | '~' < フ } $small_y ;
# f } $vowel > フ | '~' ;
fa <> ファ ;
fi <> フィ ;
fe <> フェ ;
fo <> フォ ;
fu <> フ ;

r | '~' < リ } $small_y ;
ry } $vowel > リ | '~y' ;
ra <> ラ ;
ri <> リ ;
ru <> ル ;
re <> レ ;
ro <> ロ ;

za <> ザ ;
zi <> ゼィ ;
zu <> ズ ;
ze <> ゼ ;
zo <> ゾ ;

sa <> サ ;
si <> セィ ;
su <> ス ;
se <> セ ;
so <> ソ ;

sha < シャ ;
shi'~i' < シィ ; # liu
shu < シュ ;
she < シェ ;
sho < ショ ;
shi <> シ ;
sh } $vowel > シ | '~y' ;

ta <> タ ;
ti <> ティ ;
tu <> テゥ ;
te <> テ ;
to <> ト ;

tsu <> ツ ;

# v } $vowel > ヴ | '~' ;
#'v~a' < ヴァ ; # liu
#'v~i' < ヴィ ; # liu
#'v~e' < ヴェ ; # liu
#'v~o' < ヴォ ; # liu
vu <> ヴ ;

u <> ウ ;

# w } $vowel > ウ | '~' ;
wa <> ワ ;
wi <> ヰ ;
wu > ウ ;
we <> ヱ ;
wo <> ヲ ;

ya <> ヤ ;
yi > イ ;
yu <> ユ ;
ye > エ ;
yo <> ヨ ;

# double consonants

#specials
s } sh > ッ ;
t } ch > ッ ;

#voiced
j } j <> ッ } $j_start ;
b } b <> ッ } [$h_start$f_start] $voice;
d } d <> ッ } $t_start $voice;
g } g <> ッ } $k_start $voice;
p } p <> ッ } [$h_start$f_start] $semivoice;
# v } v <> ッ } [ワヰウヱヲう] $voice ;
z } z <> ッ } $s_start $voice;
v } v <> ッ } $v_start;

# normal
k } k <> ッ } $k_start ;
m } m <> ッ } $m_start ;
n } n <> ッ } $n_start ;
h } h <> ッ } $h_start ;
f } f <> ッ } $f_start ;
r } r <> ッ } $r_start ;
t } t <> ッ } $t_start ;
s } s <> ッ } $s_start ;
w } w <> ッ } $w_start;
y } y <> ッ } $y_start;

# completeness
x } x > ッ ;
c } k > ッ ;
c } c > ッ ;
c } q > ッ ;
l } l > ッ ;
q } q > ッ ;
# y } y > ッ ;
# w } w > ッ ;

# prolonged vowel mark. this indicates a doubling of
# the preceding vowel sound

#a < a { ー ; # liu
#e < e { ー ; # liu
#i < i { ー ; # liu
#o < o { ー ; # liu
#u < u { ー ; # liu
$macron <> ー ;

# small forms

'~a' <> ァ ;
'~i' <> ィ ;
'~u' <> ゥ ;
'~e' <> ェ ;
'~o' <> ォ ;
'~tsu' <> ッ ;
'~wa' <> ヮ ;
'~ya' <> ャ ;
'~yi' > ィ ;
'~yu' <> ュ ;
'~ye' > ェ ;
'~yo' <> ョ ;

# iteration marks
# TODO: make more accurate

j $1 < sh (y* $vowel) {ヽ$voice ;
dj $1 < ch (y* $vowel) {ヽ$voice ;
dz $1 < ts (y* $vowel) {ヽ$voice ;
g $1 < k (y* $vowel) {ヽ$voice ;
z $1 < s (y* $vowel) {ヽ$voice ;
d $1 < t (y* $vowel) {ヽ$voice ;
h $1 < b (y* $vowel) {ヽ$voice ;
v $1 < w (y* $vowel) {ヽ$voice ;
sh $1 < sh (y* $vowel) {ヽ$voice ;
j $1 < j (y* $vowel) {ヽ$voice ;
ch $1 < ch (y* $vowel) {ヽ$voice ;
dj $1 < dj(y* $vowel) {ヽ$voice ;
ts $1 < ts (y* $vowel) {ヽ$voice ;
dz $1 < dz (y* $vowel) {ヽ$voice ;
$1 < ($consonant y* $vowel) {ヽ$voice? ;
$1 < (.) {ヽ $voice? ; # otherwise repeat last character
< ヽ $voice? ; # delete if no characters found

# h- rule: lengthens vowel if not followed by a vowel
[aeiou] } h > ー ;

# one-way latin->kana rules. these do not occur in
# well-formed romaji representing actual japanese text.
# their purpose is to make all romaji map to kana of
# some sort.

# the following are not really necessary, but produce
# slightly more natural results.

cy > セィ ;
dy > ディ ;
hy > ヒ ;
sy > セィ ;
ty > ティ ;
zy > ゼィ ;

h > ヘ ;

# isolated consonants listed here so as not to mask
# longer rules above.

ch > チ;
sh > シ ;
dz > ヅ ;
dj > ヂ;

b > ブ ;
d > デ ;
g > グ ;
k > ク ;
m > ム ;
n'' < ン } $n_quoter ;
n <> ン ;
p > プ ;
r > ル ;
s > ス ;
t > テ ;
y > イ ;
z > ズ ;
v > ヴ ;
f > フ;
j > ジ;
w > ウ;

ß > | ss ;
æ > | e ;
ð > | d ;
ø > | u ;
þ > | th ;

# simple substitutions using backup

c > | k ;
l > | r ;
q > | k ;
x > | ks ;

# ~~~ END shared rules ~~~

#------------------------------------------------------
# Final cleanup

'~' > ; # delete stray tildes between letters
[:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters

# [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use

:: NFC (NFD) ;

:: ([:Katakana:] halfwidth-fullwidth);

# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);

# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
:: ( [[\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;

# eof