// t_Arab_Latn.txt


 // -*- Coding: utf-8; -*-
//--------------------------------------------------------------------
// Copyright (c) 1999-2002, International Business Machines
// Corporation and others.  All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
// Tool: dumpicurules.bat
// Source: ../../../impl/data/Transliterator_Arabic_Latin.txt
// Date: Sat Jul 27 10:31:01 2002
//--------------------------------------------------------------------

// Arabic_Latin

// Resource "t_Arab_Latn": the Arabic <-> Latin transliteration rule set.
// The "Rule" entry below is a sequence of quoted strings, one rule per
// string, interleaved with // comments.
//
// Notation used by the rules in this file (ICU transliterator rule syntax):
//   a <> b   two-way rule (used for both Arabic->Latin and Latin->Arabic)
//   a >  b   Arabic->Latin only
//   a <  b   Latin->Arabic only (b is the text matched, a is the output)
//   |        cursor position in the output
//   }        following context (must match, but is not replaced)
//   :: X (Y) apply transform/filter X going forward, Y going in reverse
// Rule order is significant; see the "prevent masking" note further down.
t_Arab_Latn {
  Rule {
//--------------------------------------------------------------------
//--------------------------------------------------------------------
//--------------------------------------------------------------------

// Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf>
// Occasionally deviates in the direction of ISO 233 <http://homepage.mac.com/sirbinks/pdf/Arabic.pdf>
// a) where required for disambiguation.
// b) with underdot instead of cedilla for letters like SAD, since
//     those are explicitly in Unicode for transliteration.
// c) with extra non-Arabic-language letters, like PEH

// Does *not* do assimilation of "al", nor hyphenation.
// While it could be done, we need to determine whether a prefix "al" could
// occur other than as the definite article (since no space is used).

// Forward filter: the set of characters the Arabic->Latin direction operates on.
// NOTE(review): there may be an invisible character between '[' and 'ⁿ' in the
// set below (the same sequence appears in the last fallback rule) -- confirm
// it is intended before editing this line.
":: [[:Arabic:] [‎ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;"
// Normalize to NFKD going forward; NFC is applied in the reverse direction.
":: NFKD (NFC);"
// Combining marks used as building blocks on the Latin side:
//   $disambig / $disambig2  appended to Latin output so that digraphs and
//                           look-alike letters stay distinguishable from the
//                           plain-letter rules (compare "y", "y $disambig" and
//                           "y $disambig2" below).
//   $under                  the "underdot" mentioned in note b) above.
"$disambig =  ̱ ;" 
"$disambig2 =  ̰ ;"
"$under =  ̣ ;"

// Characters whose canonical combining class is neither 0 nor 230.
// Used only by the reverse-only TEH MARBUTA rule below.
"$notAbove = [[:^ccc=0:]&[:^ccc=230:]];"

// non-letters

// The Arabic separators get $disambig after the ASCII character; the Arabic
// comma, semicolon, question mark and percent sign below map to bare ASCII.
 "٫ <> '.' $disambig ;" // ARABIC DECIMAL SEPARATOR
 "٬ <> ',' $disambig ;" // ARABIC THOUSANDS SEPARATOR
//  ٭ <> ; # ARABIC FIVE POINTED STAR // no need to transliterate

"، <> ',' ;" // ARABIC COMMA
 "؛ <> ';' ;" // ARABIC SEMICOLON
 "؟ <> '?' ;" // ARABIC QUESTION MARK
 "٪ <> '%' ;" // ARABIC PERCENT SIGN

// Extended Arabic-Indic digits map to ASCII digit + $disambig, while the
// Arabic-Indic digits further down map to the bare ASCII digit, so the two
// digit sets remain distinct on the Latin side.
 "۰ <> 0 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT ZERO
 "۱ <> 1 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT ONE
 "۲ <> 2 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT TWO
 "۳ <> 3 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT THREE
 "۴ <> 4 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT FOUR
 "۵ <> 5 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT FIVE
 "۶ <> 6 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT SIX
 "۷ <> 7 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT SEVEN
 "۸ <> 8 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT EIGHT
 "۹ <> 9 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT NINE

 "٠ <> 0 ;" // ARABIC-INDIC DIGIT ZERO
 "١ <> 1 ;" // ARABIC-INDIC DIGIT ONE
 "٢ <> 2 ;" // ARABIC-INDIC DIGIT TWO
 "٣ <> 3 ;" // ARABIC-INDIC DIGIT THREE
 "٤ <> 4 ;" // ARABIC-INDIC DIGIT FOUR
 "٥ <> 5 ;" // ARABIC-INDIC DIGIT FIVE
 "٦ <> 6 ;" // ARABIC-INDIC DIGIT SIX
 "٧ <> 7 ;" // ARABIC-INDIC DIGIT SEVEN
 "٨ <> 8 ;" // ARABIC-INDIC DIGIT EIGHT
 "٩ <> 9 ;" // ARABIC-INDIC DIGIT NINE

// letters

// long vowels
// Each rule pairs a short-vowel mark with the following letter and maps the
// pair to one macron vowel. They are listed ahead of the single-character
// rules for the same marks and letters further down.
  "َا<> ā ;" // ARABIC FATHA, ARABIC LETTER ALEF
  "ُو <> ū ;" // ARABIC DAMMA, ARABIC LETTER WAW
 "ِي <> ī ;" // ARABIC KASRA, ARABIC LETTER YEH

// longer items moved here to prevent masking
// (i.e. multi-character Latin sequences are placed before the rules for the
// single Latin letters they start with, such as "t", "d", "s", "z" and "g".)
 "ث <> t h $disambig ;" // ARABIC LETTER THEH
 "ذ <> d h $disambig ;" // ARABIC LETTER THAL
 "ش <> s h $disambig ;" // ARABIC LETTER SHEEN
 "ص <> s $under ;" // ARABIC LETTER SAD
 "ض <> d $under ;" // ARABIC LETTER DAD
 "ط <> t $under ;" // ARABIC LETTER TAH
 "ظ <> z $under ;" // ARABIC LETTER ZAH
 "غ <> g h $disambig ;" // ARABIC LETTER GHAIN

// WARNING: special case
// <t, umlaut, half-ring below> will be canonically ordered as <t, half-ring below, umlaut>
// so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
// ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS

// First rule: plain two-way mapping to "t" + U+0308.
// Second rule: reverse-only. It matches "t", one or more $notAbove marks, then
// U+0308; it outputs TEH MARBUTA and re-emits the captured marks ($1) after
// the cursor so that they are still available to later rules.
 "ة <> t \u0308 ;" // ARABIC LETTER TEH MARBUTA
 "ة | $1 < t ($notAbove+) \u0308 ;" // ARABIC LETTER TEH MARBUTA

// non-Arabic language
// Note that NG places $disambig between "n" and "g", unlike the other
// digraph rules, which place it at the end.
 "ژ <> z h $disambig ;" // ARABIC LETTER JEH
 "ڭ <> n $disambig g ;" // ARABIC LETTER NG
 "ۋ <> v $disambig ;" // ARABIC LETTER VE
 "ی <> y $disambig2 ;" // ARABIC LETTER FARSI YEH

// Arabic language

 "ء <> ʾ ;" // ARABIC LETTER HAMZA
 "ا <> a $under;" // ARABIC LETTER ALEF
 "ب <> b ;" // ARABIC LETTER BEH
 "ت <> t ;" // ARABIC LETTER TEH
 "ج <> j ;" // ARABIC LETTER JEEM
 "ح <> h $under ;" // ARABIC LETTER HAH
 "خ <> k h $disambig ;" // ARABIC LETTER KHAH
 "د <> d ;" // ARABIC LETTER DAL
 "ر <> r ;" // ARABIC LETTER REH
 "ز <> z ;" // ARABIC LETTER ZAIN
 "س <> s ;" // ARABIC LETTER SEEN
 "ع <> ʿ ;" // ARABIC LETTER AIN
// Forward-only rule with an empty right-hand side: TATWEEL produces no Latin
// output, and nothing on the Latin side maps back to it.
  "ـ > ;" // ARABIC TATWEEL
 "ف <> f ;" // ARABIC LETTER FEH
 "ق <> q ;" // ARABIC LETTER QAF
 "ك <> k ;" // ARABIC LETTER KAF
 "ل <> l ;" // ARABIC LETTER LAM
 "م <> m ;" // ARABIC LETTER MEEM
 "ن <> n ;" // ARABIC LETTER NOON
 "ه <> h ;" // ARABIC LETTER HEH
 "و <> w ;" // ARABIC LETTER WAW
 "ى <> y $disambig ;" // ARABIC LETTER ALEF MAKSURA
 "ي <> y ;" // ARABIC LETTER YEH
 "ً <> aⁿ ;" // ARABIC FATHATAN
 "ٌ <> uⁿ ;" // ARABIC DAMMATAN
 "ٍ <> iⁿ ;" // ARABIC KASRATAN
 "َ <> a ;" // ARABIC FATHA
 "ُ <> u ;" // ARABIC DAMMA
 "ِ <> i ;" // ARABIC KASRA
 "ّ <>   ̃ ;" // ARABIC SHADDA
 "ْ <>   ̊ ;" // ARABIC SUKUN

// special combining marks
 "ٓ <>  ̂ ;" // ARABIC MADDAH ABOVE
 "ٔ <>  ̉ ;" // ARABIC HAMZA ABOVE
 "ٕ <>  ̹ ;" // ARABIC HAMZA BELOW

// Some non-Arabic language (not in UNGEGN)
 "پ <> p ;" // ARABIC LETTER PEH
 "چ <> c h $disambig ;" // ARABIC LETTER TCHEH
 "ڤ <> v ;" // ARABIC LETTER VEH
// ڥ <> v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
// ڢ <> f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
 "گ <> g ;" // ARABIC LETTER GAF

// fallbacks
// Reverse-only rules for Latin input that has no rule of its own above.
// Each rewrites the matched text to Latin letters that do have rules; the
// leading '|' leaves the cursor before the rewritten text so it is processed
// again. The first rule applies only when "c" is followed by e, i or y; the
// second catches every other "c" not already consumed by "c h $disambig".
"| s < c } [eiy];"
"| k < c ;"
"| i < e ;"
"| u < o ;"
"| ks < x ;"
// NOTE(review): the right-hand side below may contain an invisible character
// before 'ⁿ' -- confirm against the generator's source before editing.
"| n < ‎ⁿ;"

// Reverse-direction setup (the parenthesized parts apply to Latin->Arabic):
// lowercase the input, normalize (NFC forward, NFD in reverse), and restrict
// the reverse direction to Latin letters plus the listed punctuation, digits,
// modifier letters and combining marks.
":: (lower) ;"
"::NFC (NFD);"
":: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );"
  }
}