Arabic_Latin.txt   [plain text]


#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------

# Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf>
# Occasionally deviates in the direction of ISO 233 <http://homepage.mac.com/sirbinks/pdf/Arabic.pdf>
# a) where required for disambiguation.
# b) with underdot instead of cedilla for letters like SAD, since
#     those are explicitly in Unicode for transliteration.
# c) with extra non-Arabic-language letters, like PEH

# Does *not* do assimilation of "al", nor hyphenation.
# While it could be done, we need to determine whether a prefix "al" could
# occur other than as the definite article (since no space is used).

# Global filter for the forward (Arabic -> Latin) direction: characters outside
# this set are left untouched by the whole transform. Besides the Arabic script
# it explicitly lists the superscript n, Arabic comma/semicolon/question mark,
# tatweel, the combining marks U+064B-U+0655, both digit ranges (with the
# percent sign and separators in U+066A-U+066C) and U+FDFC.
# NOTE(review): an invisible U+200E appears to sit just before the superscript n;
# presumably it only stabilizes display order and is skipped as pattern
# white space -- confirm.
:: [[:Arabic:] [‎ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;
# Forward: decompose the input to NFKD before the conversion rules run.
# The parenthesized NFC belongs to the reverse (Latin -> Arabic) direction;
# reverse statements run bottom-to-top, so it is the last reverse step.
:: NFKD (NFC);
# Combining marks appended to Latin output so that Arabic letters which would
# otherwise romanize identically stay distinct and can be mapped back:
#   $disambig  - first-choice disambiguation mark (see the th/dh/sh/gh rules)
#   $disambig2 - second mark, used where $disambig is already taken (FARSI YEH)
#   $under     - underdot used for SAD, DAD, TAH, ZAH, HAH and bare ALEF
$disambig =  ̱ ; 
$disambig2 =  ̰ ;
$under =  ̣ ;

# Marks whose canonical combining class is neither 0 nor 230 (Above).
# Used by the reverse TEH MARBUTA rule to step over marks that sit between
# the base letter t and the diaeresis U+0308.
$notAbove = [[:^ccc=0:]&[:^ccc=230:]];

# non-letters
# Punctuation and digits. All rules here are two-way (<>).

# The Arabic decimal and thousands separators map to '.' and ',' plus
# $disambig, so on the way back they are not confused with a plain '.'
# or with the ',' produced by ARABIC COMMA below.
 ٫ <> '.' $disambig ; # ARABIC DECIMAL SEPARATOR
 ٬ <> ',' $disambig ; # ARABIC THOUSANDS SEPARATOR
#  ٭ <> ; # ARABIC FIVE POINTED STAR // no need to transliterate

، <> ',' ; # ARABIC COMMA
 ؛ <> ';' ; # ARABIC SEMICOLON
 ؟ <> '?' ; # ARABIC QUESTION MARK
 ٪ <> '%' ; # ARABIC PERCENT SIGN

# Extended Arabic-Indic digits become an ASCII digit plus $disambig.
# They are listed before the plain Arabic-Indic digits so that, in the
# reverse direction, "digit + $disambig" is matched before the bare digit.
 ۰ <> 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO
 ۱ <> 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE
 ۲ <> 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO
 ۳ <> 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE
 ۴ <> 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR
 ۵ <> 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE
 ۶ <> 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX
 ۷ <> 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN
 ۸ <> 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT
 ۹ <> 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE

# Arabic-Indic digits map to the plain ASCII digits.
 ٠ <> 0 ; # ARABIC-INDIC DIGIT ZERO
 ١ <> 1 ; # ARABIC-INDIC DIGIT ONE
 ٢ <> 2 ; # ARABIC-INDIC DIGIT TWO
 ٣ <> 3 ; # ARABIC-INDIC DIGIT THREE
 ٤ <> 4 ; # ARABIC-INDIC DIGIT FOUR
 ٥ <> 5 ; # ARABIC-INDIC DIGIT FIVE
 ٦ <> 6 ; # ARABIC-INDIC DIGIT SIX
 ٧ <> 7 ; # ARABIC-INDIC DIGIT SEVEN
 ٨ <> 8 ; # ARABIC-INDIC DIGIT EIGHT
 ٩ <> 9 ; # ARABIC-INDIC DIGIT NINE

# letters

# long vowels
# A short-vowel mark followed by its matching letter is written as a single
# macron vowel. These two-character rules precede the single-character rules
# for the same marks and letters further down, so the pair is matched first.
  َا<> ā ; # ARABIC FATHA, ARABIC LETTER ALEF
  ُو <> ū ; # ARABIC DAMMA, ARABIC LETTER WAW
 ِي <> ī ; # ARABIC KASRA, ARABIC LETTER YEH

# longer items moved here to prevent masking
# The Latin side of each rule below starts with a letter (t, d, s, z, g) that
# is also a complete Latin side of a later single-letter rule. Listing the
# longer sequences first lets the reverse direction match e.g. "t h $disambig"
# or "s $under" before the plain "t" or "s" rule can claim the first letter.
 ث <> t h $disambig ; # ARABIC LETTER THEH
 ذ <> d h $disambig ; # ARABIC LETTER THAL
 ش <> s h $disambig ; # ARABIC LETTER SHEEN
 ص <> s $under ; # ARABIC LETTER SAD
 ض <> d $under ; # ARABIC LETTER DAD
 ط <> t $under ; # ARABIC LETTER TAH
 ظ <> z $under ; # ARABIC LETTER ZAH
 غ <> g h $disambig ; # ARABIC LETTER GHAIN

# WARNING: special case
# <t, umlaut, half-ring below> will be canonically ordered as <t, half-ring below, umlaut>
# so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
# ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS

# Two-way rule for the simple case: t immediately followed by U+0308.
 ة <> t \u0308 ; # ARABIC LETTER TEH MARBUTA
# Reverse-only rule (<) for the reordered case: t, then one or more $notAbove
# marks (captured as $1), then U+0308. It emits TEH MARBUTA followed by the
# captured marks; the cursor (|) is left between them, so the preserved marks
# are still converted by the rules that follow.
 ة | $1 < t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA

# non-Arabic language
# Letters whose Latin side begins with z, n, v or y; they are placed ahead of
# the plain single-letter rules for those Latin letters in later sections.
# In the NG rule the $disambig mark follows the n, not the g.
# FARSI YEH uses $disambig2 because "y $disambig" is used by ALEF MAKSURA.
 ژ <> z h $disambig ; # ARABIC LETTER JEH
 ڭ <> n $disambig g ; # ARABIC LETTER NG
 ۋ <> v $disambig ; # ARABIC LETTER VE
 ی <> y $disambig2 ; # ARABIC LETTER FARSI YEH

# Arabic language
# Core letter and vowel-mark rules. Everything is two-way (<>) except TATWEEL.
# Within this section, rules with a longer Latin side (a + $under, h + $under,
# k h + $disambig, y + $disambig, vowel + superscript n) appear before the
# rule for the same bare Latin letter, so the longer form wins in reverse.

 ء <> ʾ ; # ARABIC LETTER HAMZA
# A bare ALEF (one not consumed by the FATHA + ALEF long-vowel rule) is
# written as a + $under.
 ا <> a $under; # ARABIC LETTER ALEF
 ب <> b ; # ARABIC LETTER BEH
 ت <> t ; # ARABIC LETTER TEH
 ج <> j ; # ARABIC LETTER JEEM
 ح <> h $under ; # ARABIC LETTER HAH
 خ <> k h $disambig ; # ARABIC LETTER KHAH
 د <> d ; # ARABIC LETTER DAL
 ر <> r ; # ARABIC LETTER REH
 ز <> z ; # ARABIC LETTER ZAIN
 س <> s ; # ARABIC LETTER SEEN
 ع <> ʿ ; # ARABIC LETTER AIN
# One-way (>) rule with an empty result: TATWEEL is dropped going to Latin
# and is never produced going back.
  ـ > ; # ARABIC TATWEEL
 ف <> f ; # ARABIC LETTER FEH
 ق <> q ; # ARABIC LETTER QAF
 ك <> k ; # ARABIC LETTER KAF
 ل <> l ; # ARABIC LETTER LAM
 م <> m ; # ARABIC LETTER MEEM
 ن <> n ; # ARABIC LETTER NOON
 ه <> h ; # ARABIC LETTER HEH
 و <> w ; # ARABIC LETTER WAW
 ى <> y $disambig ; # ARABIC LETTER ALEF MAKSURA
 ي <> y ; # ARABIC LETTER YEH
# The three tanwin marks map to a vowel followed by a superscript n.
 ً <> aⁿ ; # ARABIC FATHATAN
 ٌ <> uⁿ ; # ARABIC DAMMATAN
 ٍ <> iⁿ ; # ARABIC KASRATAN
 َ <> a ; # ARABIC FATHA
 ُ <> u ; # ARABIC DAMMA
 ِ <> i ; # ARABIC KASRA
# SHADDA and SUKUN map to a Latin combining mark rather than to a letter;
# SHADDA is not expanded into a doubled consonant.
 ّ <>   ̃ ; # ARABIC SHADDA
 ْ <>   ̊ ; # ARABIC SUKUN

# special combining marks
# Each Arabic combining mark maps one-to-one onto a Latin combining mark.
# The Latin mark for HAMZA BELOW is the below-mark discussed in the
# TEH MARBUTA special case above.
 ٓ <>  ̂ ; # ARABIC MADDAH ABOVE
 ٔ <>  ̉ ; # ARABIC HAMZA ABOVE
 ٕ <>  ̹ ; # ARABIC HAMZA BELOW

# Some non-Arabic language (not in UNGEGN)
# TCHEH is written "c h $disambig"; it precedes the fallback rules for a bare
# "c" below, so the full sequence is matched first in the reverse direction.
# VEH takes the plain "v"; the earlier VE rule uses "v $disambig".
 پ <> p ; # ARABIC LETTER PEH
 چ <> c h $disambig ; # ARABIC LETTER TCHEH
 ڤ <> v ; # ARABIC LETTER VEH
# ڥ <> v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
# ڢ <> f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
 گ <> g ; # ARABIC LETTER GAF

# fallbacks
# Reverse-only (<) rules for Latin letters that no rule above produces.
# Each one rewrites the letter into Latin text that the rules above do handle;
# the leading cursor (|) puts the rewritten text back in front of the cursor
# so it is converted on the next pass.
# c before e, i or y (context only, not consumed) becomes s ...
| s < c } [eiy];
# ... any other c becomes k.
| k < c ;
| i < e ;
| u < o ;
| ks < x ;
# A superscript n not consumed by one of the tanwin rules becomes a plain n.
# NOTE(review): an invisible U+200E appears to precede the superscript n here;
# presumably it is skipped as pattern white space -- confirm.
| n < ‎ⁿ;

# The statements below mostly concern the reverse (Latin -> Arabic) direction.
# In reverse they run bottom-to-top: the filter, then NFD, then lowercasing,
# and only after that the conversion rules above.

# Reverse only: lowercase the Latin input, since all rules are written with
# lowercase Latin letters. Nothing is done in the forward direction.
:: (lower) ;
# Forward: recompose the Latin output to NFC as the last step.
# Reverse: decompose the Latin input to NFD so base letters and their
# combining marks are separate code points when the rules run.
::NFC (NFD);
# Global filter for the reverse direction: Latin-script characters plus the
# ASCII punctuation and digits, modifier half rings and combining marks that
# appear on the Latin side of the rules above (and U+037E).
:: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );