#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# Arabic <-> Latin transliteration rules.
#
# Generally follows UNGEGN.
# Occasionally deviates in the direction of ISO 233:
#   a) where required for disambiguation;
#   b) with underdot instead of cedilla for letters like SAD, since
#      those are explicitly in Unicode for transliteration;
#   c) with extra non-Arabic-language letters, like PEH.
# Does *not* do assimilation of "al", nor hyphenation.
# While it could be done, we need to determine whether a prefix "al" could
# occur other than as the definite article (since no space is used).
#
# Rule notation used below:
#   x <> y   conversion rule used in both directions
#   x >  y   forward (Arabic -> Latin) only
#   x <  y   reverse (Latin -> Arabic) only
#   |        cursor position in the output; text after it is processed again
#   }        following context that must match but is not replaced
# Rules are order-sensitive; do not reorder them.

# Forward filter: only characters in this set are touched when going
# Arabic -> Latin.
:: [[:Arabic:] [‎ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;

# Normalization step: NFKD in the forward direction, NFC in the reverse direction.
:: NFKD (NFC);

# Combining marks appended to Latin output so that the mapping stays reversible.
# $disambig and $disambig2 mark digraphs and outputs that would otherwise
# collide with another rule; $under is the underdot mentioned in note b) above.
$disambig = ̱ ;
$disambig2 = ̰ ;
$under = ̣ ;

# Any character whose canonical combining class is neither 0 nor 230
# (i.e. a combining mark that is not of the "above" class).
$notAbove = [[:^ccc=0:]&[:^ccc=230:]];

# non-letters

٫ <> '.' $disambig ; # ARABIC DECIMAL SEPARATOR
٬ <> ',' $disambig ; # ARABIC THOUSANDS SEPARATOR
# ٭ <> ; # ARABIC FIVE POINTED STAR // no need to transliterate
، <> ',' ; # ARABIC COMMA
؛ <> ';' ; # ARABIC SEMICOLON
؟ <> '?' ; # ARABIC QUESTION MARK
٪ <> '%' ; # ARABIC PERCENT SIGN

# Extended digits carry $disambig so they round-trip separately from the
# plain Arabic-Indic digits below, which map to the bare ASCII digits.
۰ <> 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO
۱ <> 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE
۲ <> 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO
۳ <> 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE
۴ <> 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR
۵ <> 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE
۶ <> 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX
۷ <> 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN
۸ <> 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT
۹ <> 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE
٠ <> 0 ; # ARABIC-INDIC DIGIT ZERO
١ <> 1 ; # ARABIC-INDIC DIGIT ONE
٢ <> 2 ; # ARABIC-INDIC DIGIT TWO
٣ <> 3 ; # ARABIC-INDIC DIGIT THREE
٤ <> 4 ; # ARABIC-INDIC DIGIT FOUR
٥ <> 5 ; # ARABIC-INDIC DIGIT FIVE
٦ <> 6 ; # ARABIC-INDIC DIGIT SIX
٧ <> 7 ; # ARABIC-INDIC DIGIT SEVEN
٨ <> 8 ; # ARABIC-INDIC DIGIT EIGHT
٩ <> 9 ; # ARABIC-INDIC DIGIT NINE

# letters

# long vowels: short vowel mark + matching letter; listed before the
# single-character rules so the pair is matched as a unit.
َا<> ā ; # ARABIC FATHA, ARABIC LETTER ALEF
ُو <> ū ; # ARABIC DAMMA, ARABIC LETTER WAW
ِي <> ī ; # ARABIC KASRA, ARABIC LETTER YEH

# longer items moved here to prevent masking: these multi-character Latin
# outputs must be tried before the shorter rules further down that start
# with the same letter (e.g. "t h" before "t").
ث <> t h $disambig ; # ARABIC LETTER THEH
ذ <> d h $disambig ; # ARABIC LETTER THAL
ش <> s h $disambig ; # ARABIC LETTER SHEEN
ص <> s $under ; # ARABIC LETTER SAD
ض <> d $under ; # ARABIC LETTER DAD
ط <> t $under ; # ARABIC LETTER TAH
ظ <> z $under ; # ARABIC LETTER ZAH
غ <> g h $disambig ; # ARABIC LETTER GHAIN

# WARNING: special case for TEH MARBUTA followed by a mark such as HAMZA BELOW.
# The Latin result (t, diaeresis, half-ring below) will be canonically
# ordered as (t, half-ring below, diaeresis), as in the commented-out rule:
# so on the return, we have to skip over (but preserve) the half-ring below
# (or others like it). The reverse-only rule captures the intervening
# $notAbove marks as $1 and leaves the cursor in front of them so that they
# are converted by the later rules.
# ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
ة <> t \u0308 ; # ARABIC LETTER TEH MARBUTA
ة | $1 < t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA

# non-Arabic language
ژ <> z h $disambig ; # ARABIC LETTER JEH
ڭ <> n $disambig g ; # ARABIC LETTER NG
ۋ <> v $disambig ; # ARABIC LETTER VE
ی <> y $disambig2 ; # ARABIC LETTER FARSI YEH

# Arabic language
ء <> ʾ ; # ARABIC LETTER HAMZA
ا <> a $under; # ARABIC LETTER ALEF
ب <> b ; # ARABIC LETTER BEH
ت <> t ; # ARABIC LETTER TEH
ج <> j ; # ARABIC LETTER JEEM
ح <> h $under ; # ARABIC LETTER HAH
خ <> k h $disambig ; # ARABIC LETTER KHAH
د <> d ; # ARABIC LETTER DAL
ر <> r ; # ARABIC LETTER REH
ز <> z ; # ARABIC LETTER ZAIN
س <> s ; # ARABIC LETTER SEEN
ع <> ʿ ; # ARABIC LETTER AIN
# Forward-only rule with empty output: tatweel is dropped and is never
# produced in the reverse direction.
ـ > ; # ARABIC TATWEEL
ف <> f ; # ARABIC LETTER FEH
ق <> q ; # ARABIC LETTER QAF
ك <> k ; # ARABIC LETTER KAF
ل <> l ; # ARABIC LETTER LAM
م <> m ; # ARABIC LETTER MEEM
ن <> n ; # ARABIC LETTER NOON
ه <> h ; # ARABIC LETTER HEH
و <> w ; # ARABIC LETTER WAW
ى <> y $disambig ; # ARABIC LETTER ALEF MAKSURA
ي <> y ; # ARABIC LETTER YEH

# vowel marks: the tanwin forms (vowel + superscript n) come before the
# plain short vowels that share their first letter.
ً <> aⁿ ; # ARABIC FATHATAN
ٌ <> uⁿ ; # ARABIC DAMMATAN
ٍ <> iⁿ ; # ARABIC KASRATAN
َ <> a ; # ARABIC FATHA
ُ <> u ; # ARABIC DAMMA
ِ <> i ; # ARABIC KASRA
ّ <> ̃ ; # ARABIC SHADDA
ْ <> ̊ ; # ARABIC SUKUN

# special combining marks
ٓ <> ̂ ; # ARABIC MADDAH ABOVE
ٔ <> ̉ ; # ARABIC HAMZA ABOVE
ٕ <> ̹ ; # ARABIC HAMZA BELOW

# Some non-Arabic language (not in UNGEGN)
پ <> p ; # ARABIC LETTER PEH
چ <> c h $disambig ; # ARABIC LETTER TCHEH
ڤ <> v ; # ARABIC LETTER VEH
# ڥ <> v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
# ڢ <> f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
گ <> g ; # ARABIC LETTER GAF

# fallbacks (reverse direction only): Latin letters that no rule above
# produces are rewritten to a letter that does have a rule. The leading
# "|" puts the cursor before the replacement so it is then converted by
# the rules above. "c" becomes "s" before e/i/y and "k" otherwise.
| s < c } [eiy];
| k < c ;
| i < e ;
| u < o ;
| ks < x ;
| n < ‎ⁿ;

# Reverse direction only: lowercase the Latin input.
:: (lower) ;

# Normalization step: NFC in the forward direction, NFD in the reverse direction.
::NFC (NFD);

# Reverse filter: only characters in this set are touched when going
# Latin -> Arabic.
:: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );