module GF.Text.Lexing (stringOp,opInEnv,bindTok) where
import GF.Text.Transliterations
import Data.Char (isSpace,toUpper,toLower)
import Data.List (intersperse)
stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
stringOp good name = case name of
"chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
"lextext" -> Just $ appLexer (lexText good)
"lexcode" -> Just $ appLexer lexCode
"lexmixed" -> Just $ appLexer (lexMixed good)
"lexgreek" -> Just $ appLexer lexAGreek
"lexgreek2" -> Just $ appLexer lexAGreek2
"words" -> Just $ appLexer words
"bind" -> Just $ appUnlexer (unwords . bindTok)
"unchars" -> Just $ appUnlexer concat
"unlextext" -> Just $ appUnlexer (unlexText . unquote . bindTok)
"unlexcode" -> Just $ appUnlexer unlexCode
"unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
"unlexgreek" -> Just $ appUnlexer unlexAGreek
"unlexnone" -> Just id
"unlexid" -> Just id
"unwords" -> Just $ appUnlexer unwords
"to_html" -> Just wrapHTML
_ -> transliterate name
-- perform op in environments beg--end, t.ex. between "--"
--- suboptimal implementation
opInEnv :: String -> String -> (String -> String) -> (String -> String)
opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where
chop mk@(lg, mark) s0 s =
let (tag,rest) = splitAt lg s in
if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest
else case s of
c:cs -> chop mk (c:s0) cs
[] -> [reverse s0]
switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg)
(lbeg,lend) = (length beg, length end)
altern m ts = case ts of
t:ws | not m && t==beg -> t : altern True ws
t:ws | m && t==end -> t : altern False ws
t:ws -> (if m then op t else t) : altern m ws
[] -> []
appLexer :: (String -> [String]) -> String -> String
appLexer f = unwords . filter (not . null) . f
appUnlexer :: ([String] -> String) -> String -> String
----appUnlexer f = unlines . map (f . words) . lines
appUnlexer f = f . words
wrapHTML :: String -> String
wrapHTML = unlines . tag . intersperse "<br>" . lines where
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
-- * Text lexing
-- | Text lexing with standard word capitalization of the first word of every sentence
lexText :: (String -> Bool) -> String -> [String]
lexText good = lexText' (uncapitInit good)
-- | Text lexing with custom treatment of the first word of every sentence.
lexText' :: (String->String) -> String -> [String]
lexText' uncap1 = uncap . lext where
lext s = case s of
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
c:cs | isMinorPunct c -> [c] : lext cs
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
_ -> [s]
uncap s = case s of
w:ws -> uncap1 w:ws
_ -> s
unlexText :: [String] -> String
unlexText = capitInit . unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isPunct c -> w ++ [c]
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
_ -> []
-- | Bind tokens separated by Prelude.BIND, i.e. &+
bindTok :: [String] -> [String]
bindTok ws = case ws of
w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
"&+":ws -> bindTok ws
"&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
"&|":ws -> bindTok ws
w:ws -> w:bindTok ws
[] -> []
-- * Code lexing
-- | Haskell lexer, usable for much code
lexCode :: String -> [String]
lexCode ss = case lex ss of
[(w@(_:_),ws)] -> w : lexCode ws
_ -> []
-- * Ancient Greek lexing
lexTextAGreek :: String -> [String]
lexTextAGreek s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in w : lext cs
[] -> []
-- Philological greek text may use vowel length indicators. Then '.' is not a sentence
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
lexTextAGreek2 :: String -> [String]
lexTextAGreek2 s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in case cs of
'.':'.':d:ds | isSpace d
-> (w++['.']) : lext ('.':d:ds)
'.':d:ds | isAGreekPunct d || isSpace d
-> (w++['.']) : lext (d:ds)
'.':d:ds | not (isSpace d)
-> case lext (d:ds) of
e:es -> (w++['.']++e) : es
es -> (w++['.']) : es
'.':[] -> (w++['.']) : []
_ -> w : lext cs
[] -> []
unlexTextAGreek :: [String] -> String
unlexTextAGreek = unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isAGreekPunct c -> w ++ [c]
w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
[] -> []
isAGreekPunct = flip elem ".,;