LexingAGreek.hs

Plain text version of LexingAGreek.hs

module PGF.LexingAGreek where                      -- HL 2a1.2.2016
import Data.Char(isSpace)

-- * Text lexing without word capitalization of the first word of every sentence.
-- Greek sentences in (transliterated) texts don't start with capital character. 

-- Ordinary greek text does not have vowel length indicators. We then use '.' as
-- a sentence separator.

lexTextAGreek :: String -> [String]
lexTextAGreek s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
           in w : lext cs
    [] -> []

-- Philological greek text may use vowel length indicators. Then '.' is not a sentence 
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.

lexTextAGreek2 :: String -> [String]
lexTextAGreek2 s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
           in case cs of 
                '.':'.':d:ds | isSpace d 
                  -> (w++['.']) : lext ('.':d:ds)
                '.':d:ds | isAGreekPunct d || isSpace d 
                  -> (w++['.']) : lext (d:ds)
                '.':d:ds | not (isSpace d) 
                  -> case lext (d:ds) of
                       e:es -> (w++['.']++e) : es
                       es -> (w++['.']) : es 
                '.':[] -> (w++['.']) : []
                _ -> w : lext cs      
    [] -> []

unlexTextAGreek :: [String] -> String
unlexTextAGreek = unlext where
  unlext s = case s of
    w:[] -> w
    w:[c]:[] | isAGreekPunct c -> w ++ [c]
    w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
    w:ws -> w ++ " " ++ unlext ws
    [] -> []

isAGreekPunct = flip elem ".,;