-- | Lexers and unlexers - they work on space-separated word strings module GF.Text.Lexing (stringOp,opInEnv,bindTok) where import GF.Text.Transliterations import Data.Char (isSpace,toUpper,toLower) import Data.List (intersperse) stringOp :: (String -> Bool) -> String -> Maybe (String -> String) stringOp good name = case name of "chars" -> Just $ appLexer (filter (not . all isSpace) . map return) "lextext" -> Just $ appLexer (lexText good) "lexcode" -> Just $ appLexer lexCode "lexmixed" -> Just $ appLexer (lexMixed good) "lexgreek" -> Just $ appLexer lexAGreek "lexgreek2" -> Just $ appLexer lexAGreek2 "words" -> Just $ appLexer words "bind" -> Just $ appUnlexer (unwords . bindTok) "unchars" -> Just $ appUnlexer concat "unlextext" -> Just $ appUnlexer (unlexText . unquote . bindTok) "unlexcode" -> Just $ appUnlexer unlexCode "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok) "unlexgreek" -> Just $ appUnlexer unlexAGreek "unlexnone" -> Just id "unlexid" -> Just id "unwords" -> Just $ appUnlexer unwords "to_html" -> Just wrapHTML _ -> transliterate name -- perform op in environments beg--end, t.ex. between "--" --- suboptimal implementation opInEnv :: String -> String -> (String -> String) -> (String -> String) opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where chop mk@(lg, mark) s0 s = let (tag,rest) = splitAt lg s in if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest else case s of c:cs -> chop mk (c:s0) cs [] -> [reverse s0] switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg) (lbeg,lend) = (length beg, length end) altern m ts = case ts of t:ws | not m && t==beg -> t : altern True ws t:ws | m && t==end -> t : altern False ws t:ws -> (if m then op t else t) : altern m ws [] -> [] appLexer :: (String -> [String]) -> String -> String appLexer f = unwords . filter (not . null) . f appUnlexer :: ([String] -> String) -> String -> String ----appUnlexer f = unlines . map (f . words) . lines appUnlexer f = f . words wrapHTML :: String -> String wrapHTML = unlines . tag . intersperse "
" . lines where tag ss = "":"":"":"":"" : ss ++ ["",""] -- * Text lexing -- | Text lexing with standard word capitalization of the first word of every sentence lexText :: (String -> Bool) -> String -> [String] lexText good = lexText' (uncapitInit good) -- | Text lexing with custom treatment of the first word of every sentence. lexText' :: (String->String) -> String -> [String] lexText' uncap1 = uncap . lext where lext s = case s of c:cs | isMajorPunct c -> [c] : uncap (lext cs) c:cs | isMinorPunct c -> [c] : lext cs c:cs | isSpace c -> lext cs _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs _ -> [s] uncap s = case s of w:ws -> uncap1 w:ws _ -> s unlexText :: [String] -> String unlexText = capitInit . unlext where unlext s = case s of w:[] -> w w:[c]:[] | isPunct c -> w ++ [c] w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs) w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs w:ws -> w ++ " " ++ unlext ws _ -> [] -- | Bind tokens separated by Prelude.BIND, i.e. &+ bindTok :: [String] -> [String] bindTok ws = case ws of w1:"&+":w2:ws -> bindTok ((w1++w2):ws) "&+":ws -> bindTok ws "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws) "&|":ws -> bindTok ws w:ws -> w:bindTok ws [] -> [] -- * Code lexing -- | Haskell lexer, usable for much code lexCode :: String -> [String] lexCode ss = case lex ss of [(w@(_:_),ws)] -> w : lexCode ws _ -> [] -- * Ancient Greek lexing lexTextAGreek :: String -> [String] lexTextAGreek s = lext s where lext s = case s of c:cs | isAGreekPunct c -> [c] : (lext cs) c:cs | isSpace c -> lext cs _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s in w : lext cs [] -> [] -- Philological greek text may use vowel length indicators. Then '.' is not a sentence -- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c. lexTextAGreek2 :: String -> [String] lexTextAGreek2 s = lext s where lext s = case s of c:cs | isAGreekPunct c -> [c] : (lext cs) c:cs | isSpace c -> lext cs _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s in case cs of '.':'.':d:ds | isSpace d -> (w++['.']) : lext ('.':d:ds) '.':d:ds | isAGreekPunct d || isSpace d -> (w++['.']) : lext (d:ds) '.':d:ds | not (isSpace d) -> case lext (d:ds) of e:es -> (w++['.']++e) : es es -> (w++['.']) : es '.':[] -> (w++['.']) : [] _ -> w : lext cs [] -> [] unlexTextAGreek :: [String] -> String unlexTextAGreek = unlext where unlext s = case s of w:[] -> w w:[c]:[] | isAGreekPunct c -> w ++ [c] w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs w:ws -> w ++ " " ++ unlext ws [] -> [] isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset, -- second version · = 00B7 standard code point -- * Text lexing and unlexing for Ancient Greek: -- 1. no capitalization of initial word, -- 2. grave/acute accent switch on final syllables of words not followed by punctuation, -- 3. accent move from/to support word to/from following clitic words (iterated). lexAGreek :: String -> [String] lexAGreek = fromAGreek . lexTextAGreek lexAGreek2 :: String -> [String] lexAGreek2 = fromAGreek . lexTextAGreek2 unlexAGreek :: [String] -> String unlexAGreek = unlexTextAGreek . toAGreek -- Note: unlexAGreek does not glue punctuation with the previous word, so that short -- vowel indication (like a.) differs from sentence end (a .). -- | normalize = change grave accent on sentence internal words to acute, -- and shift inherited acutes to the following enclitic (where they are -- visible only as shown in the list of enclitics above) normalize :: String -> String normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) fromAGreek :: [String] -> [String] fromAGreek s = case s of w:[]:vs -> w:[]:(fromAGreek vs) w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs) w:v:vs | wasEnclitic v && wasEnclitic w -> getEnclitic w : fromAGreek (v:vs) w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*" getProclitic w : getEnclitic v : fromAGreek vs w:v:vs | wasEnclitic v && (hasEndCircum w || (hasEndAcute w && hasSingleAccent w)) -> w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*" w:v:vs | wasEnclitic v && hasPrefinalAcute w -> w : getEnclitic v : fromAGreek vs w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*" dropLastAccent w : getEnclitic v : fromAGreek vs w:v:vs | wasEnclitic w -> getEnclitic w : fromAGreek (v:vs) w:ws -> (toAcute w) : (fromAGreek ws) ws -> ws -- | de-normalize = change acute accent of end syllables in sentence internal -- (non-enclitic) words to grave accent, and move accents of enclitics to the -- previous word to produce ordinary ancient greek denormalize :: String -> String denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) toAGreek :: [String] -> [String] toAGreek s = case s of w:[]:vs -> w:[]:(toAGreek vs) w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek w:v:vs | isEnclitic v && isEnclitic w -> addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2 w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta addAcute w: (toAGreek (dropAccent v:vs)) w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta w:v:vs | isEnclitic v && hasPrefinalAcute w -> w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b. w:v:vs | isEnclitic v -> (addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma w:v:vs | isEnclitic w -> w:(toAGreek (v:vs)) w:ws -> (toGrave w) : (toAGreek ws) ws -> ws -- | Change accent on the final syllable of a word toGrave :: String -> String toGrave = reverse . grave . reverse where grave s = case s of '\'':cs -> '`':cs c:cs | isAGreekVowel c -> c:cs c:cs -> c: grave cs _ -> s toAcute :: String -> String toAcute = reverse . acute . reverse where acute s = case s of '`':cs -> '\'':cs c:cs | isAGreekVowel c -> c:cs c:cs -> c: acute cs _ -> s isAGreekVowel = flip elem "aeioyhw" -- | Accent moves for enclitics and proclitics (atona) enclitics = [ "moy","moi","me", -- personal pronouns "soy","soi","se", "oy(","oi(","e(", "tis*","ti","tina'", -- indefinite pronoun "tino's*","tini'", "tine's*","tina's*", "tinw~n","tisi'","tisi'n", "poy","poi", -- indefinite adverbs "pove'n","pws*", "ph|","pote'", "ge","te","toi", -- particles "nyn","per","pw" -- suffix -"de" -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~) ] -- and more, BR 11 proclitics = [ "o(","h(","oi(","ai(", -- articles "e)n","ei)s*","e)x","e)k", -- prepositions "ei)","w(s*", -- conjunctions "oy)","oy)k","oy)c" -- negation ] isEnclitic = flip elem enclitics isProclitic = flip elem proclitics -- Check if a word is an enclitic or accented enclitic and extract the enclitic wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) ++ (map dropAccent (filter hasAccent enclitics)) accented = (filter hasAccent enclitics) ++ map addAcute (filter (not . hasAccent) enclitics) in flip elem (accented ++ unaccented) wasProclitic = flip elem (map addAcute proclitics) getEnclitic = let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics)) ++ (map addAcute (filter (not . hasAccent) enclitics))) (enclitics ++ (filter hasAccent enclitics) ++ (filter (not . hasAccent) enclitics)) find = \v -> lookup v pairs in \v -> case (find v) of Just x -> x _ -> v getProclitic = let pairs = zip (map addAcute proclitics) proclitics find = \v -> lookup v pairs in \v -> case (find v) of Just x -> x _ -> v -- | Accent manipulation dropAccent = reverse . drop . reverse where drop s = case s of [] -> [] '\'':cs -> cs '`':cs -> cs '~':cs -> cs c:cs -> c:drop cs dropLastAccent = reverse . drop . reverse where drop s = case s of [] -> [] '\'':cs -> cs '`':cs -> cs '~':cs -> cs c:cs -> c:drop cs addAcute :: String -> String addAcute = reverse . acu . reverse where acu w = case w of c:cs | c == '\'' -> c:cs c:cs | c == '(' -> '\'':c:cs c:cs | c == ')' -> '\'':c:cs c:cs | isAGreekVowel c -> '\'':c:cs c:cs -> c : acu cs _ -> w -- | Accent checking on end syllables hasEndAcute = find . reverse where find s = case s of [] -> False '\'':cs -> True '`':cs -> False '~':cs -> False c:cs | isAGreekVowel c -> False _:cs -> find cs hasEndCircum = find . reverse where find s = case s of [] -> False '\'':cs -> False '`':cs -> False '~':cs -> True c:cs | isAGreekVowel c -> False _:cs -> find cs hasPrefinalAcute = find . reverse where find s = case s of [] -> False '\'':cs -> False -- final acute '`':cs -> False '~':cs -> False c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs c:cs | isAGreekVowel c -> findNext cs _:cs -> find cs where findNext s = case s of [] -> False '\'':cs -> True -- prefinal acute '`':cs -> False '~':cs -> False c:cs | isAGreekVowel c -> False _:cs -> findNext cs where hasSingleAccent v = hasAccent v && not (hasAccent (dropLastAccent v)) hasAccent v = case v of [] -> False c:cs -> elem c ['\'','`','~'] || hasAccent cs {- Tests: -- denormalization. Examples in BR 11 work: -} enclitics_expls = -- normalized "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha "sofo's tis*":"sofoi' tine's*": -- a.beta "ei) tis*":"ei) tine's*": "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma "doy~los* tis*":"doy~loi tine's*": "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b. "ei) poy tis* tina' i)'doi": -- Anm. 2. [] unlexCode :: [String] -> String unlexCode s = case s of w:[] -> w [c]:cs | isParen c -> [c] ++ unlexCode cs w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs w:ws -> w ++ " " ++ unlexCode ws _ -> [] -- | LaTeX lexer in the math mode: \ should not be separated from the next word lexLatexCode :: String -> [String] lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex restoreBackslash ws = case ws of "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww w:ww -> w:restoreBackslash ww _ -> ws -- * Mixed lexing -- | LaTeX style lexer, with "math" environment using Code between $...$ lexMixed :: (String -> Bool) -> String -> [String] lexMixed good = concat . alternate False [] where alternate env t s = case s of '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs c:cs -> alternate env (c:t) cs _ -> [lex env (reverse t)] lex env = if env then lexLatexCode else lexText good unlexMixed :: (String -> Bool) -> [String] -> String unlexMixed good = capitInit . concat . alternate False where alternate env s = case s of _:_ -> case break (flip elem ["$","\\[","\\]","\$","\$"]) s of (t,[]) -> unlex env t : [] (t,c:m) -> unlex env t : sep env c m : alternate (not env) m _ -> [] unlex env = if env then unlexCode else (uncapitInit good . unlexText) sep env c m = case (m,env) of ([p]:_,True) | isPunct p -> c -- closing $ glued to next punct (_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows _ -> " " ++ c -- put space before opening $ -- * Additional lexing uitilties -- | Capitalize first letter capitInit s = case s of c:cs -> toUpper c : cs _ -> s -- | Uncapitalize first letter uncapitInit good s = case s of c:cs | not (good s) -> toLower c : cs _ -> s -- | Unquote each string wrapped in double quotes unquote = map unq where unq s = case s of '"':cs@(_:_) | last cs == '"' -> init cs _ -> s isPunct = flip elem ".?!,:;" isMajorPunct = flip elem ".?!" isMinorPunct = flip elem ",:;" isParen = flip elem "()[]{}" isClosing = flip elem ")]}"