Greek_Latin_UNGEGN.txt   [plain text]


#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# For modern Greek, based on UNGEGN rules.

# Rules are predicated on running NFD first, and NFC afterwards
# MINIMAL FILTER GENERATED FOR: Greek-Latin/UNGEGN
# WARNING: need to add accents to both filters ###
# :: [́̄̆̈;µ·ÀÂÈÊÌÎÒÔÙÛàâèêìîòôùûĈ-ĉĜ-ĝĤ-ĥĴ-ĵŜ-ŝŴ-ŷǛ-ǜǸ-ǹ̀̂̓-̔̀͂-̓ͅͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϖϰ-ϵЀЍѐѝḔ-ḕṐ-ṑẀ-ẁẐ-ẑẤ-ậẰ-ằẾ-ệỐ-ộỜ-ờỪ-ừỲ-ỳἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-῍῏-ΐῖ-Ί῝῟-῭ῲ-ῴῶ-ῼΩ\u03F7-\u07FB\u03F9] ;

# Global filter for the forward (Greek -> Latin) direction: only Greek-script
# characters, nonspacing/enclosing marks, and the listed punctuation
# (':' through ';', '?', U+00B7, U+037E, U+0387) are eligible for change.
:: [[[:Greek:][:Mn:][:Me:]] [\:-;?\u00B7\u037E\u0387]] ;
# Forward: decompose (NFD) before the rules run. The parenthesized NFC is the
# transform used at this position when the rule set is run in reverse.
::NFD (NFC) ;

# Useful variables
# NOTE(review): $upper, $macron, $ddot, $lcgvowelC and $caron are not
# referenced by any rule visible in this file -- confirm before removing.

# Lowercase / uppercase letters restricted to the Latin and Greek scripts.
$lower = [[:latin:][:greek:] & [:Ll:]] ;
$upper = [[:latin:][:greek:] & [:Lu:]] ;
# Any nonspacing or enclosing combining mark.
$accent = [[:Mn:][:Me:]] ;

# Single combining marks (macron above, diaeresis).
$macron = ̄ ;
$ddot = ̈ ;

# Greek vowels by case, and combined.
$lcgvowel = [αεηιουω] ;
$ucgvowel = [ΑΕΗΙΟΥΩ] ;
$gvowel = [$lcgvowel $ucgvowel] ;
# Lowercase Greek vowels plus any combining mark.
$lcgvowelC = [$lcgvowel $accent] ;

# Latin vowels (including y), and the union of Latin and Greek vowels.
$evowel = [aeiouyAEIOUY];
$vowel = [ $evowel $gvowel] ;

# Context: a lowercase letter, possibly preceded by combining marks.
# Used below to choose title-case output (Ps, Th, Ch) over all-caps (PS, TH, CH).
$beforeLower = $accent * $lower ;

# Greek letters before which a gamma is written "n", and the Latin letters
# that trigger the same decision when converting back.
$gammaLike = [ΓΚΞΧγκξχϰ] ;
$egammaLike = [GKXCgkxc] ;
# Ancient-Greek diacritics (breathings, iota subscript); removed by the rules below.
$smooth = ̓ ;
$rough = ̔ ;
$iotasub = ͅ ;

# Letters before which upsilon in a diphthong becomes "v" rather than "f".
$softener = [βΒγΓδΔζΖλΛμΜνΝρΡ$gvowel] ;

# Combining mark written below a Latin letter to tag the less common Greek
# source (e.g. eta vs. iota) so the reverse transform can round-trip.
$under = ̱;

$caron = ̌;

# A letter followed / preceded by any run of apostrophes and combining marks.
$afterLetter = [:L:] [\'$accent]* ;
$beforeLetter = [\'$accent]* [:L:] ;

# Fix punctuation

# Preserve the original: a literal ':' or '?' in the Greek text is tagged
# with $under, because the plain ':' and '?' are produced by the two rules
# that follow. These tagging rules must come first so that, in reverse,
# the tagged forms are matched before the plain ones.
\: <> \: $under ;
\? <> \? $under ;

# Greek question mark (written ';') <-> Latin '?'.
\; <> \? ;
# Greek raised dot (ano teleia) <-> Latin ':'.
· <> \: ;

# Fix any ancient characters that creep in
# Forward-only rules: the three accent marks below are all rewritten to the
# same single accent, and breathings / iota subscript are deleted.

͂ > ́ ;
̂ > ́ ;
̀ > ́ ;
# Empty right-hand side: the matched mark is simply removed.
$smooth > ;
$rough > ;
$iotasub > ;
ͺ > ;

# need to have these up here so the rules don't mask
# (The first matching rule wins. In reverse, "i"+$under, "o"+$under and "ps"
# must be tried before the plain i / o / p rules further down.)

# Eta shares the Latin letter "i" with iota; $under marks it as eta.
η <> i $under ;
Η <> I $under ;

# Capital psi: "Ps" when a lowercase letter follows, otherwise "PS".
Ψ } $beforeLower <> Ps ;
Ψ <> PS ;
ψ <> ps ;

# Omega shares the Latin letter "o" with omicron; $under marks it as omega.
ω <> o $under ;
Ω <>  O $under;

# at beginning or end of word, convert mp to b
# Forward only. A word edge is detected as a neighbouring character that is
# neither a letter nor a combining mark.
# NOTE(review): a negated set may not match the very start/end of the text
# itself -- confirm against ICU boundary-matching behaviour.

[^[:L:]$accent] { μπ > b ;
μπ } [^[:L:]$accent] > b ;
# Any other case combination of mu+pi at a word edge becomes capital B
# (the all-lowercase pair was already handled by the two rules above).
[^[:L:]$accent] { [Μμ][Ππ] > B ;
[Μμ][Ππ] } [^[:L:]$accent] > B ;

# Reverse only: Latin b/B always becomes mu+pi; capital B yields "Μπ" when a
# lowercase letter follows and "ΜΠ" otherwise.
μπ < b ;
Μπ < B } $beforeLower ;
ΜΠ < B ;

# handle diphthongs ending with upsilon

# omicron + upsilon <-> "ou", in every case combination.
ου <> ou ;
ΟΥ <> OU ;
Ου <> Ou ;
οΥ <> oU ;

# Left context for the v/f rules: a Latin a, e or i (either case), optionally
# tagged with $under. It is written in Latin because the preceding vowel has
# already been converted by the time upsilon is examined.
$fmaker = [aeiAEI] $under ? ;
$shiftForwardVowels = [[:Mn:]-[\u0308]]; # note: a diaeresis keeps the items separate

# After such a vowel, upsilon before a $softener letter becomes "v"+$under.
# Any non-diaeresis marks on the upsilon are captured as $1 and emitted in
# front of the consonant, i.e. they end up on the preceding vowel.
$fmaker { υ ( $shiftForwardVowels )* } $softener > $1 v $under ;
# Reverse: marks followed by "v"+$under become upsilon carrying those marks.
υ $1 < ( $shiftForwardVowels )* v $under ;

# Same pattern in every other position: upsilon becomes "f"+$under.
$fmaker { υ ( $shiftForwardVowels )* } > $1 f $under;
υ $1 < ( $shiftForwardVowels )* f $under ;

# Capital upsilon after such a vowel: "V"+$under before a $softener,
# otherwise "U"+$under.
$fmaker { Υ } $softener <> V $under ;
$fmaker { Υ <> U $under ;

# Upsilon outside any of the diphthongs above.
υ <> y ;
Υ <> Y ;

# NORMAL
# Letter-by-letter mappings. Where a letter has a context-dependent rule,
# that rule is listed before the unconditional one so that it takes priority.

α <> a ;
Α <> A ;

β <> v ;
Β <> V ;

# Gamma before a gamma-like letter is written "n"; in reverse, "n" before
# one of the Latin letters in $egammaLike becomes gamma.
γ } $gammaLike <> n } $egammaLike ;
γ <> g ;
Γ } $gammaLike <> N } $egammaLike ;
Γ <> G ;

δ <> d ;
Δ <> D ;

ε <> e ;
Ε <> E ;

ζ <> z ;
Ζ <> Z ;

# Capital theta: "Th" when a lowercase letter follows, otherwise "TH".
θ <> th ;
Θ } $beforeLower <> Th ;
Θ <> TH ;

ι <> i ;
Ι <> I ;

κ <> k ;
Κ <> K ;

λ <> l ;
Λ <> L ;

μ <> m ;
Μ <> M ;

# A real nu before a gamma-like letter gets a trailing apostrophe so that it
# is not read back as gamma by the rules above.
# NOTE(review): the lowercase rule is forward-only while the uppercase rule
# is bidirectional; the reverse path for lowercase appears to rely on the
# apostrophe-deleting rule near the end of the file -- confirm this is intended.
ν } $gammaLike > n\' ;
ν <> n ;
Ν } $gammaLike <> N\' ;
Ν <> N ;

ξ <> x ;
Ξ <> X ;

ο <> o ;
Ο <> O ;

π <> p ;
Π <> P ;

ρ <> r ;
Ρ <> R ;

# insert separator before things that turn into s
# Forward only: after a Latin P/p (already-converted pi), insert an apostrophe
# in front of any sigma-like letter so the result is not read back as psi ("ps").
[Pp] { } [ςσΣϷϸϺϻ] > \' ; 

# special S variants

Ϸ <> Š ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L
ϸ <> š ; # ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L
Ϻ <> Ŝ ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L
ϻ <> ŝ ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L

# Caron means exception
# NOTE(review): the rules below tag the exceptional form with $under, not
# $caron; the heading above looks stale.
# The sigma form that is unexpected for its position (final-form sigma inside
# a word, non-final sigma at the end of a word) is the one that gets $under.

# before a letter, initial
ς } $beforeLetter <> s $under } $beforeLetter;
σ } $beforeLetter <> s } $beforeLetter;

# otherwise, after a letter = final
$afterLetter { σ <> $afterLetter { s $under;
$afterLetter { ς <> $afterLetter { s ;

# otherwise (isolated) = initial
ς <> s $under;
σ <> s ;

# [Pp] { Σ <> \'S ;
Σ <> S ;

# Remaining plain letters.
τ <> t ;
Τ <> T ;

φ <> f ;
Φ <> F ;

# Capital chi: "Ch" when a lowercase letter follows, otherwise "CH".
χ <> ch ;
Χ } $beforeLower <> Ch ;
Χ <> CH ;

# Completeness for ASCII
# Reverse-only rules for Latin letters that no rule above produces.
# Each one rewrites the letter to a Latin spelling that does have a Greek
# mapping; the leading '|' leaves the cursor in front of the replacement so
# that it is processed again by the regular rules.

# $ignore = [[:Mark:]''] * ;

| ch < h ;
| k  < c ;
| i  < j ;
| k < q ;
# u / w before a vowel are treated as "b"; otherwise as "y".
| b < u } $vowel ;
| b < w } $vowel ;
| y < u ;
| y < w ;

# Uppercase counterparts of the rules above.
| Ch < H ;
| K < C ;
| I < J ;
| K < Q ;
| B < W } $vowel ;
| B < U } $vowel ;
| Y < W ;
| Y < U ;

# Completeness for Greek
# Forward-only rules for Greek symbol/variant letter forms. Each is replaced
# by the corresponding standard letter, with the cursor ('|') left in front of
# it so the regular rules then transliterate that letter.

ϐ > | β ;
ϑ > | θ ;
ϒ > | Υ ;
ϕ > | φ ;
ϖ > | π ;

ϰ > | κ ;
ϱ > | ρ ;
ϲ > | σ ;
Ϲ > | Σ; #U+03F9 GREEK CAPITAL LUNATE SIGMA SYMBOL
# The one exception: this letter is mapped straight to Latin "j".
ϳ > j ;
ϴ > | Θ ;
ϵ > | ε ;
µ > | μ ;

# delete any trailing ' marks used for roundtripping
# Reverse only, with an empty replacement: drop the apostrophe that the
# forward rules put between pi and a following s, and between nu and a
# following gamma-like letter. The left context is Greek because that letter
# has already been converted back by the time the apostrophe is reached.

 < [Ππ] { \' } [Ss] ;
 < [Νν] { \' } $egammaLike ;

# Forward: recompose (NFC) after the rules. The parenthesized NFD is the
# transform used at this position when the rule set is run in reverse.
::NFC (NFD) ;

# MINIMAL FILTER GENERATED FOR: Latin-Greek/UNGEGN BACKWARD
# The parentheses make this filter apply to the reverse (Latin -> Greek)
# direction only: Latin-script characters, combining marks, and ' : ? .
:: ([[[:Latin:][:Mn:][:Me:]] ['\:?]]) ;