#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# Thai-Latin

# This set of rules follows ISO 11940
# see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf
# except that it does not mention an implicit vowel, so we use ọ
#
# The transcription is fairly ugly, so we ought to also do the UNGEGN version
# see: http://www.eki.ee/wgrs/rom1_th.pdf
# and probably make that the main variant.

# Note: this is an internal file. The NFD/NFC is handled externally, in the index
# The insertion of spaces between words, the reversal of the vowels
# and the conversion of space to semicolon are done *outside* of these rules.
# So as far as these rules are concerned, the vowels are in logical order!

# Layout note: every statement must stay on its own line. A '#' starts a
# comment that runs to the end of the line, so joining lines silently turns
# the rules that follow a comment into comment text.
#
# Notation used below (ICU transform rule syntax):
#   a <> b ;     two-way rule: a becomes b going forward (Thai to Latin),
#                b becomes a going backward (Latin to Thai)
#   a > b ;      forward-only rule
#   a < b ;      backward-only rule
#   x } ctx      x is only matched when followed by ctx; ctx is not replaced
#   ( ... ) $1   captured segment and its back-reference
#   |            cursor position after the replacement; text after the cursor
#                is processed again by the rules

# insert implicit vowel (and remove it going the other way)
# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically

#$consonant = [ก-ฮ];
#$vowel = [ะ-ฺเ-ไ็];
#{ ( $consonant ) } [^$vowel ] > | $1  ;
# > ọ ;
# < ọ ;

# Marks that are neither starters (ccc=0) nor of the named combining class.
# These are the marks that canonical reordering can place between a base
# letter and its accent above (resp. below).
$notAbove = [^\p{ccc=0}\p{ccc=above}] ;
$notBelow = [^\p{ccc=0}\p{ccc=below}] ;

# Consonants
# Warning: the 'h's need to be handled carefully!
# What we really want to say is the following, but we can't
# $notHAccent = !($notAbove* ̄ | $notBelow* ̣) ;
# Since the only accents we care about that could cause problems are free-standing accents below, we use instead:

$freeStandingBelow = [\u0325 ];
$hAccent = [ ̄ ̣];
$notHAccent0 = [^$freeStandingBelow$hAccent];
$notHAccent1 = $freeStandingBelow [^$hAccent];

# The "backward case" rules below match a base letter, any intervening marks
# that are not above-marks, and then the accent. They emit the Thai letter and
# put the captured marks back after the cursor so that they are converted by
# later rules.
ห > h̄ ; # THAI CHARACTER HO HIP
ห | $1 < h ($notAbove*) ̄; # backward case, account for reordering
ฮ <> ḥ ; # THAI CHARACTER HO NOKHUK

ข <> k̄h ; # THAI CHARACTER KHO KHAI
ฃ <> ḳ̄h ; # THAI CHARACTER KHO KHUAT
ฅ <> kʹh ; # THAI CHARACTER KHO KHON
ฆ <> ḳh ; # THAI CHARACTER KHO RAKHANG
# Plain digraphs (kh, ph, ch, th) are only converted backward when the 'h' is
# not followed by one of the h accents, so that "k" + accented "h" is not
# misread as the digraph.
ค < kh } $notHAccent1 ; # THAI CHARACTER KHO KHWAI
ค <> kh } $notHAccent0 ; # THAI CHARACTER KHO KHWAI
ก <> k ; # THAI CHARACTER KO KAI

ภ <> p̣h ; # THAI CHARACTER PHO SAMPHAO
ผ <> p̄h ; # THAI CHARACTER PHO PHUNG
พ < ph } $notHAccent1 ; # THAI CHARACTER PHO PHAN
พ <> ph } $notHAccent0 ; # THAI CHARACTER PHO PHAN
ป <> p ; # THAI CHARACTER PO PLA

ฉ <> c̄h ; # THAI CHARACTER CHO CHING
ฌ <> c̣h ; # THAI CHARACTER CHO CHOE
ช < ch } $notHAccent1 ; # THAI CHARACTER CHO CHANG
ช <> ch } $notHAccent0 ; # THAI CHARACTER CHO CHANG
จ <> c ; # THAI CHARACTER CHO CHAN

ฐ <> ṭ̄h ; # THAI CHARACTER THO THAN
ฑ <> ṯh ; # THAI CHARACTER THO NANGMONTHO
ฒ <> tʹh ; # THAI CHARACTER THO PHUTHAO
ถ <> t̄h ; # THAI CHARACTER THO THUNG
ธ <> ṭh ; # THAI CHARACTER THO THONG
ท < th } $notHAccent1 ; # THAI CHARACTER THO THAHAN
ท <> th } $notHAccent0 ; # THAI CHARACTER THO THAHAN
#Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous. So it uses vertical tick.
ฏ <> t̩ ; # THAI CHARACTER TO PATAK
ต <> t ; # THAI CHARACTER TO TAO

# since there is no singleton g (generated), don't worry about that.
ง <> ng ; # THAI CHARACTER NGO NGU
ณ <> ṇ ; # THAI CHARACTER NO NEN
น <> n ; # THAI CHARACTER NO NU
ญ <> ỵ ; # THAI CHARACTER YO YING
ฎ <> ḍ ; # THAI CHARACTER DO CHADA
ด <> d ; # THAI CHARACTER DO DEK
บ <> b ; # THAI CHARACTER BO BAIMAI
ฝ <> f̄ ; # THAI CHARACTER FO FA
ฝ | $1 < f ($notAbove*) ̄; # backward case, account for reordering
ม <> m ; # THAI CHARACTER MO MA
ย <> y ; # THAI CHARACTER YO YAK
ร <> r ; # THAI CHARACTER RO RUA
ฤ <> v ; # THAI CHARACTER RU
ฦ <> ł ; # THAI CHARACTER LU
ว <> w ; # THAI CHARACTER WO WAEN
ศ <> ṣ̄ ; # THAI CHARACTER SO SALA***
ศ | $1 < s ̣ ($notAbove*) ̄; # backward case, account for reordering
ษ <> s̄ʹ ; # THAI CHARACTER SO RUSI
ส > s̄ ; # THAI CHARACTER SO SUA***
ส | $1 < s ($notAbove*) ̄; # backward case, account for reordering
ฬ <> ḷ ; # THAI CHARACTER LO CHULA
ล <> l ; # THAI CHARACTER LO LING
ฟ <> f ; # THAI CHARACTER FO FAN
อ <> x ; # THAI CHARACTER O ANG
ซ <> s ; # THAI CHARACTER SO SO

# vowels
ั <> ạ ; # THAI CHARACTER MAI HAN-AKAT
า > ā ; # THAI CHARACTER SARA AA
า | $1 < a ($notAbove*) ̄; # backward case, account for reordering

# We deviate from ISO for SARA AM for disambiguation
ำ > a ̉; # THAI CHARACTER SARA AM
ำ | $1 < a ($notAbove*) ̉ ; # backward case, account for reordering

ะ <> a ; # THAI CHARACTER SARA A
ี <> ī ; # THAI CHARACTER SARA II
ี | $1 < i ($notAbove*) ̄ ; # backward case, account for reordering
ื <> ụ̄ ; # THAI CHARACTER SARA UEE
ื | $1 < u ̣ ($notAbove*) ̄ ; # backward case, account for reordering
ึ <> ụ ; # THAI CHARACTER SARA UE
ู <> ū ; # THAI CHARACTER SARA UU
ู | $1 < u ($notAbove*) ̄ ; # backward case, account for reordering
ุ <> u ; # THAI CHARACTER SARA U
ฯ <> ‡ ; # THAI CHARACTER PAIYANNOI
# ฿ <> XXX ; # THAI CURRENCY SYMBOL BAHT
เ <> e ; # THAI CHARACTER SARA E
แ <> æ ; # THAI CHARACTER SARA AE
โ <> o ; # THAI CHARACTER SARA O
ใ <> ı ; # THAI CHARACTER SARA AI MAIMUAN
ไ <> ị ; # THAI CHARACTER SARA AI MAIMALAI
ๅ <> ɨ ; # THAI CHARACTER LAKKHANGYAO
็ <> ̆ ; # THAI CHARACTER MAITAIKHU
่ <> ̀ ; # THAI CHARACTER MAI EK
้ <> ̂ ; # THAI CHARACTER MAI THO
๊ <> ́ ; # THAI CHARACTER MAI TRI
๋ <> ̌ ; # THAI CHARACTER MAI CHATTAWA
์ <> ̒ ; # THAI CHARACTER THANTHAKHAT
๎ <> '~' ; # THAI CHARACTER YAMAKKAN
# We deviate from ISO for disambiguation
ํ <> ̊ ; # THAI CHARACTER NIKHAHIT
๏ <> § ; # THAI CHARACTER FONGMAN
๐ <> 0 ; # THAI DIGIT ZERO
๑ <> 1 ; # THAI DIGIT ONE
๒ <> 2 ; # THAI DIGIT TWO
๓ <> 3 ; # THAI DIGIT THREE
๔ <> 4 ; # THAI DIGIT FOUR
๕ <> 5 ; # THAI DIGIT FIVE
๖ <> 6 ; # THAI DIGIT SIX
๗ <> 7 ; # THAI DIGIT SEVEN
๘ <> 8 ; # THAI DIGIT EIGHT
๙ <> 9 ; # THAI DIGIT NINE
๚ <> '||' ; # THAI CHARACTER ANGKHANKHU
๛ <> » ; # THAI CHARACTER KHOMUT
ๆ <> « ; # THAI CHARACTER MAIYAMOK

# moved down to make shorter first
# NOTE(review): presumably this refers to SARA I being listed after SARA II so
# that 'i' + macron is tried before the bare 'i' - confirm before reordering.

#Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below.
ฺ <> ˌ ; # THAI CHARACTER PHINTHU
ิ <> i ; # THAI CHARACTER SARA I

# fallbacks
# Backward only: Latin letters that no rule above produces are rewritten to a
# letter that does have a rule; the cursor is placed before the replacement so
# that it is then converted by the rules above.
| k < g ;
| k < h ;
| c < j ;
| k < q ;
| s < z ;

# Backward only: lowercase the Latin input.
:: (lower);