Transliterations.hs

Plain text version of Transliterations.hs

module GF.Text.Transliterations (
  transliterate,
  transliterateWithFile,
  transliteration,
  characterTable,
  transliterationPrintNames
  ) where

import Data.Char
import Numeric
import qualified Data.Map as Map

-- transliterations between ASCII and a Unicode character set

-- current transliterations: devanagari, thai

-- to add a new one: define the Unicode range and the corresponding ASCII strings,
-- which may be one or more characters long

-- conventions to be followed:
--   each character is either [letter] or [letter+nonletters]
--   when using a sparse range of unicodes, mark missing codes as "-" in transliterations
--   characters can be invisible: ignored in translation to unicode

transliterate :: String -> Maybe (String -> String)
transliterate s = case s of
  'f':'r':'o':'m':'_':t -> fmap appTransFromUnicode $ transliteration t
  't':'o':'_':t -> fmap appTransToUnicode $ transliteration t
  _ -> Nothing

transliterateWithFile :: String -> String -> Bool -> (String -> String)
transliterateWithFile name src isFrom =
  (if isFrom then appTransFromUnicode else appTransToUnicode) (getTransliterationFile name src)

transliteration :: String -> Maybe Transliteration
transliteration s = Map.lookup s allTransliterations

allTransliterations = Map.fromList [
  ("amharic",transAmharic),
  ("ancientgreek", transAncientGreek),
  ("arabic", transArabic),
  ("arabic_unvocalized", transArabicUnvoc),
  ("devanagari", transDevanagari),
  ("greek", transGreek),
  ("hebrew", transHebrew),
  ("persian", transPersian),
  ("sanskrit", transSanskrit),
  ("sindhi", transSindhi),
  ("nepali", transNepali),
  ("telugu", transTelugu),
  ("thai", transThai),
  ("urdu", transUrdu)
 ]

-- used in command options and help
transliterationPrintNames = [(t,printname p) | (t,p) <- Map.toList allTransliterations]

characterTable :: Transliteration -> String
characterTable = unlines . map prOne . Map.assocs . trans_from_unicode where
  prOne (i,s) = unwords ["|", showHex i "", "|", [toEnum i], "|", s, "|"]

data Transliteration = Trans {
  trans_to_unicode   :: Map.Map String Int,
  trans_from_unicode :: Map.Map Int String,
  invisible_chars    :: [String],
  printname          :: String
  }

appTransToUnicode :: Transliteration -> String -> String
appTransToUnicode trans =
  concat .
  map (\c -> maybe c (return . toEnum) $
             Map.lookup c (trans_to_unicode trans)
      ) .
  filter (flip notElem (invisible_chars trans)) .
  unchar

appTransFromUnicode :: Transliteration -> String -> String
appTransFromUnicode trans =
  concat .
  map (\c -> maybe [toEnum c] id $
             Map.lookup c (trans_from_unicode trans)
      ) .
  map fromEnum


mkTransliteration :: String -> [String] -> [Int] -> Transliteration
mkTransliteration name ts us =
 Trans (Map.fromList (tzip ts us)) (Map.fromList (uzip us ts)) [] name
  where
    tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"]
    uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"]

getTransliterationFile :: String -> String -> Transliteration
getTransliterationFile name = uncurry (mkTransliteration name) . codes
 where
  codes = unzip . map (mkOne . words) . filter (not . all isSpace) . lines
  mkOne ws = case ws of
    [c]:t:_ -> (t,fromEnum c)  --