-- | Lexers and unlexers - they work on space-separated word strings
module GF.Text.Lexing (stringOp,opInEnv,bindTok) where import GF.Text.Transliterations import Data.Char (isSpace,toUpper,toLower) import Data.List (intersperse) stringOp :: (String -> Bool) -> String -> Maybe (String -> String) stringOp good name = case name of "chars" -> Just $ appLexer (filter (not . all isSpace) . map return) "lextext" -> Just $ appLexer (lexText good) "lexcode" -> Just $ appLexer lexCode "lexmixed" -> Just $ appLexer (lexMixed good) "lexgreek" -> Just $ appLexer lexAGreek "lexgreek2" -> Just $ appLexer lexAGreek2 "words" -> Just $ appLexer words "bind" -> Just $ appUnlexer (unwords . bindTok) "unchars" -> Just $ appUnlexer concat "unlextext" -> Just $ appUnlexer (unlexText . unquote . bindTok) "unlexcode" -> Just $ appUnlexer unlexCode "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok) "unlexgreek" -> Just $ appUnlexer unlexAGreek "unlexnone" -> Just id "unlexid" -> Just id "unwords" -> Just $ appUnlexer unwords "to_html" -> Just wrapHTML _ -> transliterate name -- perform op in environments beg--end, t.ex. between "--" --- suboptimal implementation opInEnv :: String -> String -> (String -> String) -> (String -> String) opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where chop mk@(lg, mark) s0 s = let (tag,rest) = splitAt lg s in if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest else case s of c:cs -> chop mk (c:s0) cs [] -> [reverse s0] switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg) (lbeg,lend) = (length beg, length end) altern m ts = case ts of t:ws | not m && t==beg -> t : altern True ws t:ws | m && t==end -> t : altern False ws t:ws -> (if m then op t else t) : altern m ws [] -> [] appLexer :: (String -> [String]) -> String -> String appLexer f = unwords . filter (not . null) . f appUnlexer :: ([String] -> String) -> String -> String ----appUnlexer f = unlines . map (f . words) . lines appUnlexer f = f . words wrapHTML :: String -> String wrapHTML = unlines . tag . intersperse "<br>" . lines where tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"] -- * Text lexing -- | Text lexing with standard word capitalization of the first word of every sentence lexText :: (String -> Bool) -> String -> [String] lexText good = lexText' (uncapitInit good) -- | Text lexing with custom treatment of the first word of every sentence. lexText' :: (String->String) -> String -> [String] lexText' uncap1 = uncap . lext where lext s = case s of c:cs | isMajorPunct c -> [c] : uncap (lext cs) c:cs | isMinorPunct c -> [c] : lext cs c:cs | isSpace c -> lext cs _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs _ -> [s] uncap s = case s of w:ws -> uncap1 w:ws _ -> s unlexText :: [String] -> String unlexText = capitInit . unlext where unlext s = case s of w:[] -> w w:[c]:[] | isPunct c -> w ++ [c] w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs) w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs w:ws -> w ++ " " ++ unlext ws _ -> [] -- | Bind tokens separated by Prelude.BIND, i.e. &+ bindTok :: [String] -> [String] bindTok ws = case ws of w1:"&+":w2:ws -> bindTok ((w1++w2):ws) "&+":ws -> bindTok ws "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws) "&|":ws -> bindTok ws w:ws -> w:bindTok ws [] -> [] -- * Code lexing -- | Haskell lexer, usable for much code lexCode :: String -> [String] lexCode ss = case lex ss of [(w@(_:_),ws)] -> w : lexCode ws _ -> [] -- * Ancient Greek lexing lexTextAGreek :: String -> [String] lexTextAGreek s = lext s where lext s = case s of c:cs | isAGreekPunct c -> [c] : (lext cs) c:cs | isSpace c -> lext cs _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s in w : lext cs [] -> [] -- Philological greek text may use vowel length indicators. Then '.' is not a sentence -- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c. lexTextAGreek2 :: String -> [String] lexTextAGreek2 s = lext s where lext s = case s of c:cs | isAGreekPunct c -> [c] : (lext cs) c:cs | isSpace c -> lext cs _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s in case cs of '.':'.':d:ds | isSpace d -> (w++['.']) : lext ('.':d:ds) '.':d:ds | isAGreekPunct d || isSpace d -> (w++['.']) : lext (d:ds) '.':d:ds | not (isSpace d) -> case lext (d:ds) of e:es -> (w++['.']++e) : es es -> (w++['.']) : es '.':[] -> (w++['.']) : [] _ -> w : lext cs [] -> [] unlexTextAGreek :: [String] -> String unlexTextAGreek = unlext where unlext s = case s of w:[] -> w w:[c]:[] | isAGreekPunct c -> w ++ [c] w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs w:ws -> w ++ " " ++ unlext ws [] -> [] isAGreekPunct = flip elem ".,;