{-# LANGUAGE ExistentialQuantification, DeriveDataTypeable, ScopedTypeVariables #-} ------------------------------------------------- -- | -- Module : PGF2 -- Maintainer : Krasimir Angelov -- Stability : stable -- Portability : portable -- -- This module is an Application Programming Interface to -- load and interpret grammars compiled in the Portable Grammar Format (PGF). -- The PGF format is produced as the final output from the GF compiler. -- The API is meant to be used for embedding GF grammars in Haskell -- programs ------------------------------------------------- #include #include #include #include #include module PGF2 (-- * PGF PGF,readPGF,showPGF, -- * Identifiers CId, -- * Abstract syntax AbsName,abstractName, -- ** Categories Cat,categories,categoryContext, -- ** Functions Fun, functions, functionsByCat, functionType, functionIsConstructor, hasLinearization, -- ** Expressions Expr,showExpr,readExpr,pExpr, mkAbs,unAbs, mkApp,unApp, mkStr,unStr, mkInt,unInt, mkFloat,unFloat, mkMeta,unMeta, mkCId, exprHash, exprSize, exprFunctions, exprSubstitute, treeProbability, -- ** Types Type, Hypo, BindType(..), startCat, readType, showType, showContext, mkType, unType, -- ** Type checking -- | Dynamically-built expressions should always be type-checked before using in other functions, -- as the exceptions thrown by using invalid expressions may not catchable. checkExpr, inferExpr, checkType, -- ** Computing compute, -- * Concrete syntax ConcName,Concr,languages,concreteName,languageCode, -- ** Linearization linearize,linearizeAll,tabularLinearize,tabularLinearizeAll,bracketedLinearize,bracketedLinearizeAll, FId, BracketedString(..), showBracketedString, flattenBracketedString, printName, categoryFields, alignWords, -- ** Parsing ParseOutput(..), parse, parseWithHeuristics, parseToChart, PArg(..), complete, -- ** Sentence Lookup lookupSentence, -- ** Generation generateAll, -- ** Morphological Analysis MorphoAnalysis, lookupMorpho, lookupCohorts, fullFormLexicon, filterBest, filterLongest, -- ** Visualizations GraphvizOptions(..), graphvizDefaults, graphvizAbstractTree, graphvizParseTree, graphvizWordAlignment, -- * Exceptions PGFError(..), -- * Grammar specific callbacks LiteralCallback,literalCallbacks ) where import Prelude hiding (fromEnum,(<>)) -- GHC 8.4.1 clash with Text.PrettyPrint import Control.Exception(Exception,throwIO) import Control.Monad(forM_) import System.IO.Unsafe(unsafePerformIO,unsafeInterleaveIO) import System.IO(fixIO) import Text.PrettyPrint import PGF2.Expr import PGF2.Type import PGF2.FFI import Foreign hiding ( Pool, newPool, unsafePerformIO ) import Foreign.C import Data.Typeable import qualified Data.Map as Map import Data.IORef import Data.Char(isUpper,isSpace,isPunctuation) import Data.List(isSuffixOf,maximumBy,nub) import Data.Function(on) import Data.Maybe(maybe) ----------------------------------------------------------------------- -- Functions that take a PGF. -- PGF has many Concrs. -- -- A Concr retains its PGF in a field in order to retain a reference to -- the foreign pointer in case if the application still has a reference -- to Concr but has lost its reference to PGF. type AbsName = CId -- ^ Name of abstract syntax type ConcName = CId -- ^ Name of concrete syntax -- | Reads file in Portable Grammar Format and produces -- 'PGF' structure. The file is usually produced with: -- -- > $ gf -make readPGF :: FilePath -> IO PGF readPGF fpath = do pool <- gu_new_pool pgf <- withCString fpath $ \c_fpath -> withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl pgf <- pgf_read c_fpath pool exn failed <- gu_exn_is_raised exn if failed then do is_errno <- gu_exn_caught exn gu_exn_type_GuErrno if is_errno then do perrno <- (#peek GuExn, data.data) exn errno <- peek perrno gu_pool_free pool ioError (errnoToIOError "readPGF" (Errno errno) Nothing (Just fpath)) else do gu_pool_free pool throwIO (PGFError "The grammar cannot be loaded") else return pgf pgfFPtr <- newForeignPtr gu_pool_finalizer pool return (PGF pgf (touchForeignPtr pgfFPtr)) showPGF :: PGF -> String showPGF p = unsafePerformIO $ withGuPool $ \tmpPl -> do (sb,out) <- newOut tmpPl exn <- gu_new_exn tmpPl pgf_print (pgf p) out exn touchPGF p s <- gu_string_buf_freeze sb tmpPl peekUtf8CString s -- | List of all languages available in the grammar. languages :: PGF -> Map.Map ConcName Concr languages p = unsafePerformIO $ do ref <- newIORef Map.empty allocaBytes (#size GuMapItor) $ \itor -> do fptr <- wrapMapItorCallback (getLanguages ref) (#poke GuMapItor, fn) itor fptr pgf_iter_languages (pgf p) itor nullPtr freeHaskellFunPtr fptr readIORef ref where getLanguages :: IORef (Map.Map String Concr) -> MapItorCallback getLanguages ref itor key value exn = do langs <- readIORef ref name <- peekUtf8CString (castPtr key) concr <- fmap (\ptr -> Concr ptr (touchPGF p)) $ peek (castPtr value) writeIORef ref $! Map.insert name concr langs concreteName :: Concr -> ConcName concreteName c = unsafePerformIO (peekUtf8CString =<< pgf_concrete_name (concr c)) languageCode :: Concr -> String languageCode c = unsafePerformIO (peekUtf8CString =<< pgf_language_code (concr c)) -- | Generates an exhaustive possibly infinite list of -- all abstract syntax expressions of the given type. -- The expressions are ordered by their probability. generateAll :: PGF -> Type -> [(Expr,Float)] generateAll p (Type ctype _) = unsafePerformIO $ do genPl <- gu_new_pool exprPl <- gu_new_pool exn <- gu_new_exn genPl enum <- pgf_generate_all (pgf p) ctype exn genPl exprPl genFPl <- newForeignPtr gu_pool_finalizer genPl exprFPl <- newForeignPtr gu_pool_finalizer exprPl fromPgfExprEnum enum genFPl (touchPGF p >> touchForeignPtr exprFPl) -- | The abstract language name is the name of the top-level -- abstract module abstractName :: PGF -> AbsName abstractName p = unsafePerformIO (peekUtf8CString =<< pgf_abstract_name (pgf p)) -- | The start category is defined in the grammar with -- the \'startcat\' flag. This is usually the sentence category -- but it is not necessary. Despite that there is a start category -- defined you can parse with any category. The start category -- definition is just for convenience. startCat :: PGF -> Type startCat p = unsafePerformIO $ do typPl <- gu_new_pool c_type <- pgf_start_cat (pgf p) typPl typeFPl <- newForeignPtr gu_pool_finalizer typPl return (Type c_type (touchForeignPtr typeFPl)) loadConcr :: Concr -> FilePath -> IO () loadConcr c fpath = withCString fpath $ \c_fpath -> withCString "rb" $ \c_mode -> withGuPool $ \tmpPl -> do file <- fopen c_fpath c_mode inp <- gu_file_in file tmpPl exn <- gu_new_exn tmpPl pgf_concrete_load (concr c) inp exn failed <- gu_exn_is_raised exn if failed then do is_errno <- gu_exn_caught exn gu_exn_type_GuErrno if is_errno then do perrno <- (#peek GuExn, data.data) exn errno <- peek perrno ioError (errnoToIOError "loadConcr" (Errno errno) Nothing (Just fpath)) else do throwIO (PGFError "The language cannot be loaded") else return () unloadConcr :: Concr -> IO () unloadConcr c = pgf_concrete_unload (concr c) -- | The type of a function functionType :: PGF -> Fun -> Maybe Type functionType p fn = unsafePerformIO $ withGuPool $ \tmpPl -> do c_fn <- newUtf8CString fn tmpPl c_type <- pgf_function_type (pgf p) c_fn return (if c_type == nullPtr then Nothing else Just (Type c_type (touchPGF p))) -- | The type of a function functionIsConstructor :: PGF -> Fun -> Bool functionIsConstructor p fn = unsafePerformIO $ withGuPool $ \tmpPl -> do c_fn <- newUtf8CString fn tmpPl res <- pgf_function_is_constructor (pgf p) c_fn touchPGF p return (res /= 0) -- | Checks an expression against a specified type. checkExpr :: PGF -> Expr -> Type -> Either String Expr checkExpr (PGF p _) (Expr c_expr touch1) (Type c_ty touch2) = unsafePerformIO $ alloca $ \pexpr -> withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl exprPl <- gu_new_pool poke pexpr c_expr pgf_check_expr p pexpr c_ty exn exprPl touch1 >> touch2 status <- gu_exn_is_raised exn if not status then do exprFPl <- newForeignPtr gu_pool_finalizer exprPl c_expr <- peek pexpr return (Right (Expr c_expr (touchForeignPtr exprFPl))) else do is_tyerr <- gu_exn_caught exn gu_exn_type_PgfTypeError c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free exprPl if is_tyerr then return (Left msg) else throwIO (PGFError msg) -- | Tries to infer the type of an expression. Note that -- even if the expression is type correct it is not always -- possible to infer its type in the GF type system. -- In this case the function returns an error. inferExpr :: PGF -> Expr -> Either String (Expr, Type) inferExpr (PGF p _) (Expr c_expr touch1) = unsafePerformIO $ alloca $ \pexpr -> withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl exprPl <- gu_new_pool poke pexpr c_expr c_ty <- pgf_infer_expr p pexpr exn exprPl touch1 status <- gu_exn_is_raised exn if not status then do exprFPl <- newForeignPtr gu_pool_finalizer exprPl let touch = touchForeignPtr exprFPl c_expr <- peek pexpr return (Right (Expr c_expr touch, Type c_ty touch)) else do is_tyerr <- gu_exn_caught exn gu_exn_type_PgfTypeError c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free exprPl if is_tyerr then return (Left msg) else throwIO (PGFError msg) -- | Check whether a type is consistent with the abstract -- syntax of the grammar. checkType :: PGF -> Type -> Either String Type checkType (PGF p _) (Type c_ty touch1) = unsafePerformIO $ alloca $ \pty -> withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl typePl <- gu_new_pool poke pty c_ty pgf_check_type p pty exn typePl touch1 status <- gu_exn_is_raised exn if not status then do typeFPl <- newForeignPtr gu_pool_finalizer typePl c_ty <- peek pty return (Right (Type c_ty (touchForeignPtr typeFPl))) else do is_tyerr <- gu_exn_caught exn gu_exn_type_PgfTypeError c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free typePl if is_tyerr then return (Left msg) else throwIO (PGFError msg) compute :: PGF -> Expr -> Expr compute (PGF p _) (Expr c_expr touch1) = unsafePerformIO $ withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl exprPl <- gu_new_pool c_expr <- pgf_compute p c_expr exn tmpPl exprPl touch1 status <- gu_exn_is_raised exn if not status then do exprFPl <- newForeignPtr gu_pool_finalizer exprPl return (Expr c_expr (touchForeignPtr exprFPl)) else do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free exprPl throwIO (PGFError msg) treeProbability :: PGF -> Expr -> Float treeProbability (PGF p _) (Expr c_expr touch1) = unsafePerformIO $ do res <- pgf_compute_tree_probability p c_expr touch1 return (realToFrac res) exprHash :: Int32 -> Expr -> Int32 exprHash h (Expr c_expr touch1) = unsafePerformIO $ do h <- pgf_expr_hash (fromIntegral h) c_expr touch1 return (fromIntegral h) exprSize :: Expr -> Int exprSize (Expr c_expr touch1) = unsafePerformIO $ do size <- pgf_expr_size c_expr touch1 return (fromIntegral size) exprFunctions :: Expr -> [Fun] exprFunctions (Expr c_expr touch) = unsafePerformIO $ withGuPool $ \tmpPl -> do seq <- pgf_expr_functions c_expr tmpPl len <- (#peek GuSeq, len) seq arr <- peekArray (fromIntegral (len :: CInt)) (seq `plusPtr` (#offset GuSeq, data)) funs <- mapM peekUtf8CString arr touch return funs exprSubstitute :: Expr -> [Expr] -> Expr exprSubstitute (Expr c_expr touch) meta_values = unsafePerformIO $ withGuPool $ \tmpPl -> do c_meta_values <- newSequence (#size PgfExpr) pokeExpr meta_values tmpPl exprPl <- gu_new_pool c_expr <- pgf_expr_substitute c_expr c_meta_values exprPl touch exprFPl <- newForeignPtr gu_pool_finalizer exprPl let touch' = sequence_ (touchForeignPtr exprFPl : map touchExpr meta_values) return (Expr c_expr touch') where pokeExpr ptr (Expr c_expr _) = poke ptr c_expr ----------------------------------------------------------------------------- -- Graphviz data GraphvizOptions = GraphvizOptions {noLeaves :: Bool, noFun :: Bool, noCat :: Bool, noDep :: Bool, nodeFont :: String, leafFont :: String, nodeColor :: String, leafColor :: String, nodeEdgeStyle :: String, leafEdgeStyle :: String } graphvizDefaults = GraphvizOptions False False False True "" "" "" "" "" "" -- | Renders an abstract syntax tree in a Graphviz format. graphvizAbstractTree :: PGF -> GraphvizOptions -> Expr -> String graphvizAbstractTree p opts e = unsafePerformIO $ withGuPool $ \tmpPl -> do (sb,out) <- newOut tmpPl exn <- gu_new_exn tmpPl c_opts <- newGraphvizOptions tmpPl opts pgf_graphviz_abstract_tree (pgf p) (expr e) c_opts out exn touchExpr e s <- gu_string_buf_freeze sb tmpPl peekUtf8CString s graphvizParseTree :: Concr -> GraphvizOptions -> Expr -> String graphvizParseTree c opts e = unsafePerformIO $ withGuPool $ \tmpPl -> do (sb,out) <- newOut tmpPl exn <- gu_new_exn tmpPl c_opts <- newGraphvizOptions tmpPl opts pgf_graphviz_parse_tree (concr c) (expr e) c_opts out exn touchExpr e touchConcr c s <- gu_string_buf_freeze sb tmpPl peekUtf8CString s graphvizWordAlignment :: [Concr] -> GraphvizOptions -> Expr -> String graphvizWordAlignment cs opts e = unsafePerformIO $ withGuPool $ \tmpPl -> withArrayLen (map concr cs) $ \n_concrs ptr -> do (sb,out) <- newOut tmpPl exn <- gu_new_exn tmpPl c_opts <- newGraphvizOptions tmpPl opts pgf_graphviz_word_alignment ptr (fromIntegral n_concrs) (expr e) c_opts out exn touchExpr e s <- gu_string_buf_freeze sb tmpPl peekUtf8CString s newGraphvizOptions :: Ptr GuPool -> GraphvizOptions -> IO (Ptr PgfGraphvizOptions) newGraphvizOptions pool opts = do c_opts <- gu_malloc pool (#size PgfGraphvizOptions) (#poke PgfGraphvizOptions, noLeaves) c_opts (if noLeaves opts then 1 else 0 :: CInt) (#poke PgfGraphvizOptions, noFun) c_opts (if noFun opts then 1 else 0 :: CInt) (#poke PgfGraphvizOptions, noCat) c_opts (if noCat opts then 1 else 0 :: CInt) (#poke PgfGraphvizOptions, noDep) c_opts (if noDep opts then 1 else 0 :: CInt) newUtf8CString (nodeFont opts) pool >>= (#poke PgfGraphvizOptions, nodeFont) c_opts newUtf8CString (leafFont opts) pool >>= (#poke PgfGraphvizOptions, leafFont) c_opts newUtf8CString (nodeColor opts) pool >>= (#poke PgfGraphvizOptions, nodeColor) c_opts newUtf8CString (leafColor opts) pool >>= (#poke PgfGraphvizOptions, leafColor) c_opts newUtf8CString (nodeEdgeStyle opts) pool >>= (#poke PgfGraphvizOptions, nodeEdgeStyle) c_opts newUtf8CString (leafEdgeStyle opts) pool >>= (#poke PgfGraphvizOptions, leafEdgeStyle) c_opts return c_opts ----------------------------------------------------------------------------- -- Functions using Concr -- Morpho analyses, parsing & linearization -- | This triple is returned by all functions that deal with -- the grammar's lexicon. Its first element is the name of an abstract -- lexical function which can produce a given word or -- a multiword expression (i.e. this is the lemma). -- After that follows a string which describes -- the particular inflection form. -- -- The last element is a logarithm from the -- the probability of the function. The probability is not -- conditionalized on the category of the function. This makes it -- possible to compare the likelihood of two functions even if they -- have different types. type MorphoAnalysis = (Fun,String,Float) -- | 'lookupMorpho' takes a string which must be a single word or -- a multiword expression. It then computes the list of all possible -- morphological analyses. lookupMorpho :: Concr -> String -> [MorphoAnalysis] lookupMorpho (Concr concr master) sent = unsafePerformIO $ withGuPool $ \tmpPl -> do ref <- newIORef [] cback <- gu_malloc tmpPl (#size PgfMorphoCallback) fptr <- wrapLookupMorphoCallback (getAnalysis ref) (#poke PgfMorphoCallback, callback) cback fptr c_sent <- newUtf8CString sent tmpPl pgf_lookup_morpho concr c_sent cback nullPtr freeHaskellFunPtr fptr readIORef ref -- | 'lookupCohorts' takes an arbitrary string an produces -- a list of all places where lexical items from the grammar have been -- identified (i.e. cohorts). The list consists of triples of the format @(start,ans,end)@, -- where @start-end@ identifies the span in the text and @ans@ is -- the list of possible morphological analyses similar to 'lookupMorpho'. -- -- The list is sorted first by the @start@ position and after than -- by the @end@ position. This can be used for instance if you want to -- filter only the longest matches. lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)] lookupCohorts lang@(Concr concr master) sent = unsafePerformIO $ do pl <- gu_new_pool ref <- newIORef [] cback <- gu_malloc pl (#size PgfMorphoCallback) fptr <- wrapLookupMorphoCallback (getAnalysis ref) (#poke PgfMorphoCallback, callback) cback fptr c_sent <- newUtf8CString sent pl enum <- pgf_lookup_cohorts concr c_sent cback pl nullPtr fpl <- newForeignPtr gu_pool_finalizer pl fromCohortRange enum fpl fptr 0 sent ref where fromCohortRange enum fpl fptr i sent ref = allocaBytes (#size PgfCohortRange) $ \ptr -> withForeignPtr fpl $ \pl -> do gu_enum_next enum ptr pl buf <- (#peek PgfCohortRange, buf) ptr if buf == nullPtr then do finalizeForeignPtr fpl freeHaskellFunPtr fptr touchConcr lang return [] else do start <- (#peek PgfCohortRange, start.pos) ptr end <- (#peek PgfCohortRange, end.pos) ptr ans <- readIORef ref writeIORef ref [] let sent' = drop (start-i) sent tok = take (end-start) sent' cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr start sent' ref) return ((start,tok,ans,end):cohs) filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)] filterBest ans = reverse (iterate (maxBound :: Int) [(0,0,[],ans)] [] []) where iterate v0 [] [] res = res iterate v0 [] new res = iterate v0 new [] res iterate v0 ((_,v,conf, []):old) new res = case compare v0 v of LT -> res EQ -> iterate v0 old new (merge conf res) GT -> iterate v old new conf iterate v0 ((_,v,conf,an:ans):old) new res = iterate v0 old (insert (v+valueOf an) conf an ans [] new) res valueOf (_,_,[],_) = 2 valueOf _ = 1 insert v conf an@(start,_,_,end) ans l_new [] = match start v conf ans ((end,v,comb conf an,filter end ans):l_new) [] insert v conf an@(start,_,_,end) ans l_new (new@(end0,v0,conf0,ans0):r_new) = case compare end0 end of LT -> insert v conf an ans (new:l_new) r_new EQ -> case compare v0 v of LT -> match start v conf ans ((end,v, conf0,ans0): l_new) r_new EQ -> match start v conf ans ((end,v,merge (comb conf an) conf0,ans0): l_new) r_new GT -> match start v conf ans ((end,v,comb conf an, ans0): l_new) r_new GT -> match start v conf ans ((end,v,comb conf an, filter end ans):new:l_new) r_new match start0 v conf (an@(start,_,_,end):ans) l_new r_new | start0 == start = insert v conf an ans l_new r_new match start0 v conf ans l_new r_new = revOn l_new r_new comb ((start0,w0,an0,end0):conf) (start,w,an,end) | end0 == start && (unk w0 an0 || unk w an) = (start0,w0++w,[],end):conf comb conf an = an:conf filter end [] = [] filter end (next@(start,_,_,_):ans) | end <= start = next:ans | otherwise = filter end ans revOn [] ys = ys revOn (x:xs) ys = revOn xs (x:ys) merge [] ans = ans merge ans [] = ans merge (an1@(start1,_,_,end1):ans1) (an2@(start2,_,_,end2):ans2) = case compare (start1,end1) (start2,end2) of GT -> an1 : merge ans1 (an2:ans2) EQ -> an1 : merge ans1 ans2 LT -> an2 : merge (an1:ans1) ans2 filterLongest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)] filterLongest [] = [] filterLongest (an:ans) = longest an ans where longest prev [] = [prev] longest prev@(start0,_,_,end0) (next@(start,_,_,end):ans) | start0 == start = longest next ans | otherwise = filter prev (next:ans) filter prev [] = [prev] filter prev@(start0,w0,an0,end0) (next@(start,w,an,end):ans) | end0 == start && (unk w0 an0 || unk w an) = filter (start0,w0++w,[],end) ans | end0 <= start = prev : longest next ans | otherwise = filter prev ans unk w [] | any (not . isPunctuation) w = True unk _ _ = False fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])] fullFormLexicon lang = unsafePerformIO $ do pl <- gu_new_pool enum <- pgf_fullform_lexicon (concr lang) pl fpl <- newForeignPtr gu_pool_finalizer pl fromFullFormEntry enum fpl where fromFullFormEntry :: Ptr GuEnum -> ForeignPtr GuPool -> IO [(String, [MorphoAnalysis])] fromFullFormEntry enum fpl = do ffEntry <- alloca $ \ptr -> withForeignPtr fpl $ \pl -> do gu_enum_next enum ptr pl peek ptr if ffEntry == nullPtr then do finalizeForeignPtr fpl touchConcr lang return [] else do tok <- peekUtf8CString =<< pgf_fullform_get_string ffEntry ref <- newIORef [] allocaBytes (#size PgfMorphoCallback) $ \cback -> do fptr <- wrapLookupMorphoCallback (getAnalysis ref) (#poke PgfMorphoCallback, callback) cback fptr pgf_fullform_get_analyses ffEntry cback nullPtr ans <- readIORef ref toks <- unsafeInterleaveIO (fromFullFormEntry enum fpl) return ((tok,ans) : toks) getAnalysis :: IORef [MorphoAnalysis] -> LookupMorphoCallback getAnalysis ref self c_lemma c_anal prob exn = do ans <- readIORef ref lemma <- peekUtf8CString c_lemma anal <- peekUtf8CString c_anal writeIORef ref ((lemma, anal, prob):ans) -- | This data type encodes the different outcomes which you could get from the parser. data ParseOutput a = ParseFailed Int String -- ^ The integer is the position in number of unicode characters where the parser failed. -- The string is the token where the parser have failed. | ParseOk a -- ^ If the parsing and the type checking are successful -- we get the abstract syntax trees as either a list or a chart. | ParseIncomplete -- ^ The sentence is not complete. parse :: Concr -> Type -> String -> ParseOutput [(Expr,Float)] parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) [] parseWithHeuristics :: Concr -- ^ the language with which we parse -> Type -- ^ the start category -> String -- ^ the input sentence -> Double -- ^ the heuristic factor. -- A negative value tells the parser -- to lookup up the default from -- the grammar flags -> [(Cat, String -> Int -> Maybe (Expr,Float,Int))] -- ^ a list of callbacks for literal categories. -- The arguments of the callback are: -- the index of the constituent for the literal category; -- the input sentence; the current offset in the sentence. -- If a literal has been recognized then the output should -- be Just (expr,probability,end_offset) -> ParseOutput [(Expr,Float)] parseWithHeuristics lang (Type ctype touchType) sent heuristic callbacks = unsafePerformIO $ do exprPl <- gu_new_pool parsePl <- gu_new_pool exn <- gu_new_exn parsePl sent <- newUtf8CString sent parsePl callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl enum <- pgf_parse_with_heuristics (concr lang) ctype sent heuristic callbacks_map exn parsePl exprPl touchType failed <- gu_exn_is_raised exn if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error then do c_err <- (#peek GuExn, data.data) exn c_incomplete <- (#peek PgfParseError, incomplete) c_err if (c_incomplete :: CInt) == 0 then do c_offset <- (#peek PgfParseError, offset) c_err token_ptr <- (#peek PgfParseError, token_ptr) c_err token_len <- (#peek PgfParseError, token_len) c_err tok <- peekUtf8CStringLen token_ptr token_len gu_pool_free parsePl gu_pool_free exprPl return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) else do gu_pool_free parsePl gu_pool_free exprPl return ParseIncomplete else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError msg) else do gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError "Parsing failed") else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) return (ParseOk exprs) parseToChart :: Concr -- ^ the language with which we parse -> Type -- ^ the start category -> String -- ^ the input sentence -> Double -- ^ the heuristic factor. -- A negative value tells the parser -- to lookup up the default from -- the grammar flags -> [(Cat, String -> Int -> Maybe (Expr,Float,Int))] -- ^ a list of callbacks for literal categories. -- The arguments of the callback are: -- the index of the constituent for the literal category; -- the input sentence; the current offset in the sentence. -- If a literal has been recognized then the output should -- be Just (expr,probability,end_offset) -> Int -- ^ the maximal number of roots -> ParseOutput ([FId],Map.Map FId ([(Int,Int,String)],[(Expr,[PArg],Float)],Cat)) parseToChart lang (Type ctype touchType) sent heuristic callbacks roots = unsafePerformIO $ withGuPool $ \parsePl -> do do exn <- gu_new_exn parsePl sent <- newUtf8CString sent parsePl callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl ps <- pgf_parse_to_chart (concr lang) ctype sent heuristic callbacks_map (fromIntegral roots) exn parsePl parsePl touchType failed <- gu_exn_is_raised exn if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error then do c_err <- (#peek GuExn, data.data) exn c_incomplete <- (#peek PgfParseError, incomplete) c_err if (c_incomplete :: CInt) == 0 then do c_offset <- (#peek PgfParseError, offset) c_err token_ptr <- (#peek PgfParseError, token_ptr) c_err token_len <- (#peek PgfParseError, token_len) c_err tok <- peekUtf8CStringLen token_ptr token_len touchConcr lang return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) else do touchConcr lang return ParseIncomplete else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg touchConcr lang throwIO (PGFError msg) else do touchConcr lang throwIO (PGFError "Parsing failed") else do c_roots <- pgf_get_parse_roots ps parsePl let get_range c_ccat = pgf_ccat_to_range ps c_ccat parsePl c_len <- (#peek GuSeq, len) c_roots chart <- peekCCats get_range Map.empty (c_len :: CSizeT) (c_roots `plusPtr` (#offset GuSeq, data)) touchConcr lang return (ParseOk chart) where peekCCats get_range chart 0 ptr = return ([],chart) peekCCats get_range chart len ptr = do (root, chart) <- deRef (peekCCat get_range chart) ptr (roots,chart) <- peekCCats get_range chart (len-1) (ptr `plusPtr` (#size PgfCCat*)) return (root:roots,chart) peekCCat get_range chart c_ccat = do fid <- peekFId c_ccat c_total_cats <- (#peek PgfConcr, total_cats) (concr lang) if Map.member fid chart || fid < c_total_cats then return (fid,chart) else do c_cnccat <- (#peek PgfCCat, cnccat) c_ccat c_abscat <- (#peek PgfCCat, cnccat) c_cnccat c_name <- (#peek PgfCCat, cnccat) c_abscat cat <- peekUtf8CString c_name range <- get_range c_ccat >>= peekSequence peekRange (#size PgfParseRange) c_prods <- (#peek PgfCCat, prods) c_ccat if c_prods == nullPtr then do return (fid,Map.insert fid (range,[],cat) chart) else do c_len <- (#peek PgfCCat, n_synprods) c_ccat (prods,chart) <- fixIO (\res -> peekProductions (Map.insert fid (range,fst res,cat) chart) (fromIntegral (c_len :: CSizeT)) (c_prods `plusPtr` (#offset GuSeq, data))) return (fid,chart) where peekProductions chart 0 ptr = return ([],chart) peekProductions chart len ptr = do (ps1,chart) <- deRef (peekProduction chart) ptr (ps2,chart) <- peekProductions chart (len-1) (ptr `plusPtr` (#size GuVariant)) return (ps1++ps2,chart) peekProduction chart p = do tag <- gu_variant_tag p dt <- gu_variant_data p case tag of (#const PGF_PRODUCTION_APPLY) -> do { c_cncfun <- (#peek PgfProductionApply, fun) dt ; c_absfun <- (#peek PgfCncFun, absfun) c_cncfun ; expr <- (#peek PgfAbsFun, ep.expr) c_absfun ; p <- (#peek PgfAbsFun, ep.prob) c_absfun ; c_args <- (#peek PgfProductionApply, args) dt ; c_len <- (#peek GuSeq, len) c_args ; (pargs,chart) <- peekPArgs chart (c_len :: CSizeT) (c_args `plusPtr` (#offset GuSeq, data)) ; return ([(Expr expr (touchConcr lang), pargs, p)],chart) } (#const PGF_PRODUCTION_COERCE) -> do { c_coerce <- (#peek PgfProductionCoerce, coerce) dt ; (fid,chart) <- peekCCat get_range chart c_coerce ; return (maybe [] snd3 (Map.lookup fid chart),chart) } (#const PGF_PRODUCTION_EXTERN) -> do { c_ep <- (#peek PgfProductionExtern, ep) dt ; expr <- (#peek PgfExprProb, expr) c_ep ; p <- (#peek PgfExprProb, prob) c_ep ; return ([(Expr expr (touchConcr lang), [], p)],chart) } _ -> error ("Unknown production type "++show tag++" in the grammar") snd3 (_,x,_) = x peekPArgs chart 0 ptr = return ([],chart) peekPArgs chart len ptr = do (a, chart) <- peekPArg chart ptr (as,chart) <- peekPArgs chart (len-1) (ptr `plusPtr` (#size PgfPArg)) return (a:as,chart) peekPArg chart ptr = do c_hypos <- (#peek PgfPArg, hypos) ptr hypos <- if c_hypos /= nullPtr then peekSequence (deRef peekFId) (#size int) c_hypos else return [] c_ccat <- (#peek PgfPArg, ccat) ptr (fid,chart) <- peekCCat get_range chart c_ccat return (PArg hypos fid,chart) peekRange ptr = do s <- (#peek PgfParseRange, start) ptr e <- (#peek PgfParseRange, end) ptr f <- (#peek PgfParseRange, field) ptr >>= peekCString return ((fromIntegral :: CSizeT -> Int) s, (fromIntegral :: CSizeT -> Int) e, f) mkCallbacksMap :: Ptr PgfConcr -> [(String, String -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap) mkCallbacksMap concr callbacks pool = do callbacks_map <- pgf_new_callbacks_map concr pool forM_ callbacks $ \(cat,match) -> do ccat <- newUtf8CString cat pool match <- wrapLiteralMatchCallback (match_callback match) predict <- wrapLiteralPredictCallback predict_callback hspgf_callbacks_map_add_literal concr callbacks_map ccat match predict pool return callbacks_map where match_callback match c_ann poffset out_pool = do coffset <- peek poffset ann <- peekUtf8CString c_ann case match ann (fromIntegral coffset) of Nothing -> return nullPtr Just (e,prob,offset') -> do poke poffset (fromIntegral offset') -- here we copy the expression to out_pool c_e <- pgf_clone_expr (expr e) out_pool ep <- gu_malloc out_pool (#size PgfExprProb) (#poke PgfExprProb, expr) ep c_e (#poke PgfExprProb, prob) ep prob return ep predict_callback _ _ _ = return nullPtr lookupSentence :: Concr -- ^ the language with which we parse -> Type -- ^ the start category -> String -- ^ the input sentence -> [(Expr,Float)] lookupSentence lang (Type ctype _) sent = unsafePerformIO $ do exprPl <- gu_new_pool parsePl <- gu_new_pool sent <- newUtf8CString sent parsePl enum <- pgf_lookup_sentence (concr lang) ctype sent parsePl exprPl parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) return exprs -- | The oracle is a triple of functions. -- The first two take a category name and a linearization field name -- and they should return True/False when the corresponding -- prediction or completion is appropriate. The third function -- is the oracle for literals. type Oracle = (Maybe (Cat -> String -> Int -> Bool) ,Maybe (Cat -> String -> Int -> Bool) ,Maybe (Cat -> String -> Int -> Maybe (Expr,Float,Int)) ) parseWithOracle :: Concr -- ^ the language with which we parse -> Cat -- ^ the start category -> String -- ^ the input sentence -> Oracle -> ParseOutput [(Expr,Float)] parseWithOracle lang cat sent (predict,complete,literal) = unsafePerformIO $ do parsePl <- gu_new_pool exprPl <- gu_new_pool exn <- gu_new_exn parsePl cat <- newUtf8CString cat parsePl sent <- newUtf8CString sent parsePl predictPtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) predict completePtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) complete literalPtr <- maybe (return nullFunPtr) (wrapOracleLiteralCallback . oracleLiteralWrapper) literal cback <- hspgf_new_oracle_callback sent predictPtr completePtr literalPtr parsePl enum <- pgf_parse_with_oracle (concr lang) cat sent cback exn parsePl exprPl failed <- gu_exn_is_raised exn if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error then do c_err <- (#peek GuExn, data.data) exn c_incomplete <- (#peek PgfParseError, incomplete) c_err if (c_incomplete :: CInt) == 0 then do c_offset <- (#peek PgfParseError, offset) c_err token_ptr <- (#peek PgfParseError, token_ptr) c_err token_len <- (#peek PgfParseError, token_len) c_err tok <- peekUtf8CStringLen token_ptr token_len gu_pool_free parsePl gu_pool_free exprPl return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) else do gu_pool_free parsePl gu_pool_free exprPl return ParseIncomplete else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError msg) else do gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError "Parsing failed") else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) return (ParseOk exprs) where oracleWrapper oracle catPtr lblPtr offset = do cat <- peekUtf8CString catPtr lbl <- peekUtf8CString lblPtr return (oracle cat lbl (fromIntegral offset)) oracleLiteralWrapper oracle catPtr lblPtr poffset out_pool = do cat <- peekUtf8CString catPtr lbl <- peekUtf8CString lblPtr offset <- peek poffset case oracle cat lbl (fromIntegral offset) of Just (e,prob,offset) -> do poke poffset (fromIntegral offset) -- here we copy the expression to out_pool c_e <- withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl (sb,out) <- newOut tmpPl let printCtxt = nullPtr pgf_print_expr (expr e) printCtxt 1 out exn c_str <- gu_string_buf_freeze sb tmpPl guin <- gu_string_in c_str tmpPl pgf_read_expr guin out_pool tmpPl exn ep <- gu_malloc out_pool (#size PgfExprProb) (#poke PgfExprProb, expr) ep c_e (#poke PgfExprProb, prob) ep prob return ep Nothing -> do return nullPtr -- | Returns possible completions of the current partial input. complete :: Concr -- ^ the language with which we parse -> Type -- ^ the start category -> String -- ^ the input sentence (excluding token being completed) -> String -- ^ prefix (partial token being completed) -> ParseOutput [(String, CId, CId, Float)] -- ^ (token, category, function, probability) complete lang (Type ctype _) sent pfx = unsafePerformIO $ do parsePl <- gu_new_pool exn <- gu_new_exn parsePl sent <- newUtf8CString sent parsePl pfx <- newUtf8CString pfx parsePl enum <- pgf_complete (concr lang) ctype sent pfx exn parsePl failed <- gu_exn_is_raised exn if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error then do c_err <- (#peek GuExn, data.data) exn c_offset <- (#peek PgfParseError, offset) c_err token_ptr <- (#peek PgfParseError, token_ptr) c_err token_len <- (#peek PgfParseError, token_len) c_err tok <- peekUtf8CStringLen token_ptr token_len gu_pool_free parsePl return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free parsePl throwIO (PGFError msg) else do gu_pool_free parsePl throwIO (PGFError "Parsing failed") else do fpl <- newForeignPtr gu_pool_finalizer parsePl ParseOk <$> fromCompletions enum fpl where fromCompletions :: Ptr GuEnum -> ForeignPtr GuPool -> IO [(String, CId, CId, Float)] fromCompletions enum fpl = withGuPool $ \tmpPl -> do cmpEntry <- alloca $ \ptr -> withForeignPtr fpl $ \pl -> do gu_enum_next enum ptr pl peek ptr if cmpEntry == nullPtr then do finalizeForeignPtr fpl touchConcr lang return [] else do p_tok <- (#peek PgfTokenProb, tok) cmpEntry tok <- if p_tok == nullPtr then return "&+" else peekUtf8CString p_tok cat <- peekUtf8CString =<< (#peek PgfTokenProb, cat) cmpEntry fun <- peekUtf8CString =<< (#peek PgfTokenProb, fun) cmpEntry prob <- (#peek PgfTokenProb, prob) cmpEntry toks <- unsafeInterleaveIO (fromCompletions enum fpl) return ((tok, cat, fun, prob) : toks) -- | Returns True if there is a linearization defined for that function in that language hasLinearization :: Concr -> Fun -> Bool hasLinearization lang id = unsafePerformIO $ withGuPool $ \pl -> do res <- newUtf8CString id pl >>= pgf_has_linearization (concr lang) return (res /= 0) -- | Linearizes an expression as a string in the language linearize :: Concr -> Expr -> String linearize lang e = unsafePerformIO $ withGuPool $ \pl -> do (sb,out) <- newOut pl exn <- gu_new_exn pl pgf_linearize (concr lang) (expr e) out exn touchExpr e failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then return "" else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg throwIO (PGFError msg) else throwIO (PGFError "The abstract tree cannot be linearized") else do lin <- gu_string_buf_freeze sb pl peekUtf8CString lin -- | Generates all possible linearizations of an expression linearizeAll :: Concr -> Expr -> [String] linearizeAll lang e = unsafePerformIO $ do pl <- gu_new_pool exn <- gu_new_exn pl cts <- pgf_lzr_concretize (concr lang) (expr e) exn pl failed <- gu_exn_is_raised exn if failed then throwExn exn pl else collect cts exn pl where collect cts exn pl = withGuPool $ \tmpPl -> do ctree <- alloca $ \ptr -> do gu_enum_next cts ptr tmpPl peek ptr if ctree == nullPtr then do gu_pool_free pl touchExpr e return [] else do (sb,out) <- newOut tmpPl ctree <- pgf_lzr_wrap_linref ctree tmpPl pgf_lzr_linearize_simple (concr lang) ctree 0 out exn tmpPl failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then collect cts exn pl else throwExn exn pl else do lin <- gu_string_buf_freeze sb tmpPl s <- peekUtf8CString lin ss <- collect cts exn pl return (s:ss) throwExn exn pl = do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free pl throwIO (PGFError msg) else do gu_pool_free pl throwIO (PGFError "The abstract tree cannot be linearized") -- | Generates a table of linearizations for an expression tabularLinearize :: Concr -> Expr -> [(String, String)] tabularLinearize lang e = case tabularLinearizeAll lang e of (lins:_) -> lins _ -> [] -- | Generates a table of linearizations for an expression tabularLinearizeAll :: Concr -> Expr -> [[(String, String)]] tabularLinearizeAll lang e = unsafePerformIO $ withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl cts <- pgf_lzr_concretize (concr lang) (expr e) exn tmpPl failed <- gu_exn_is_raised exn touchConcr lang if failed then throwExn exn else collect cts exn tmpPl where collect cts exn tmpPl = do ctree <- alloca $ \ptr -> do gu_enum_next cts ptr tmpPl peek ptr if ctree == nullPtr then do touchExpr e return [] else do labels <- alloca $ \p_n_lins -> alloca $ \p_labels -> do pgf_lzr_get_table (concr lang) ctree p_n_lins p_labels n_lins <- peek p_n_lins labels <- peek p_labels labels <- peekArray (fromIntegral n_lins) labels labels <- mapM peekCString labels return labels lins <- collectTable lang ctree 0 labels exn tmpPl linss <- collect cts exn tmpPl return (lins : linss) collectTable lang ctree lin_idx [] exn tmpPl = return [] collectTable lang ctree lin_idx (label:labels) exn tmpPl = do (sb,out) <- newOut tmpPl pgf_lzr_linearize_simple (concr lang) ctree lin_idx out exn tmpPl failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then collectTable lang ctree (lin_idx+1) labels exn tmpPl else throwExn exn else do lin <- gu_string_buf_freeze sb tmpPl s <- peekUtf8CString lin ss <- collectTable lang ctree (lin_idx+1) labels exn tmpPl return ((label,s):ss) throwExn exn = do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg throwIO (PGFError msg) else do throwIO (PGFError "The abstract tree cannot be linearized") categoryFields :: Concr -> Cat -> Maybe [String] categoryFields lang cat = unsafePerformIO $ do withGuPool $ \tmpPl -> do p_n_lins <- gu_malloc tmpPl (#size size_t) c_cat <- newUtf8CString cat tmpPl c_fields <- pgf_category_fields (concr lang) c_cat p_n_lins if c_fields == nullPtr then do touchConcr lang return Nothing else do len <- peek p_n_lins fs <- peekFields len c_fields touchConcr lang return (Just fs) where peekFields 0 ptr = return [] peekFields len ptr = do f <- peek ptr >>= peekUtf8CString fs <- peekFields (len-1) (ptr `plusPtr` (#size GuString)) return (f:fs) -- | BracketedString represents a sentence that is linearized -- as usual but we also want to retain the ''brackets'' that -- mark the beginning and the end of each constituent. data BracketedString = Leaf String -- ^ this is the leaf i.e. a single token | BIND -- ^ the surrounding tokens must be bound together | Bracket CId {-# UNPACK #-} !FId String CId [BracketedString] -- ^ this is a bracket. The 'CId' is the category of -- the phrase. The 'FId' is an unique identifier for -- every phrase in the sentence. For context-free grammars -- i.e. without discontinuous constituents this identifier -- is also unique for every bracket. When there are discontinuous -- phrases then the identifiers are unique for every phrase but -- not for every bracket since the bracket represents a constituent. -- The different constituents could still be distinguished by using -- the analysis string. If the grammar is reduplicating -- then the constituent indices will be the same for all brackets -- that represents the same constituent. -- The second 'CId' is the name of the abstract function that generated -- this phrase. -- | Renders the bracketed string as a string where -- the brackets are shown as @(S ...)@ where -- @S@ is the category. showBracketedString :: BracketedString -> String showBracketedString = render . ppBracketedString ppBracketedString (Leaf t) = text t ppBracketedString BIND = text "&+" ppBracketedString (Bracket cat fid _ _ bss) = parens (text cat <> colon <> int fid <+> hsep (map ppBracketedString bss)) -- | Extracts the sequence of tokens from the bracketed string flattenBracketedString :: BracketedString -> [String] flattenBracketedString (Leaf w) = [w] flattenBracketedString BIND = [] flattenBracketedString (Bracket _ _ _ _ bss) = concatMap flattenBracketedString bss bracketedLinearize :: Concr -> Expr -> [BracketedString] bracketedLinearize lang e = unsafePerformIO $ withGuPool $ \pl -> do exn <- gu_new_exn pl cts <- pgf_lzr_concretize (concr lang) (expr e) exn pl failed <- gu_exn_is_raised exn if failed then throwExn exn else do ctree <- alloca $ \ptr -> do gu_enum_next cts ptr pl peek ptr if ctree == nullPtr then do touchExpr e return [] else do ctree <- pgf_lzr_wrap_linref ctree pl ref <- newIORef ([],[]) withBracketLinFuncs ref exn $ \ppLinFuncs -> pgf_lzr_linearize (concr lang) ctree 0 ppLinFuncs pl failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then return [] else throwExn exn else do (_,bs) <- readIORef ref return (reverse bs) bracketedLinearizeAll :: Concr -> Expr -> [[BracketedString]] bracketedLinearizeAll lang e = unsafePerformIO $ withGuPool $ \pl -> do exn <- gu_new_exn pl cts <- pgf_lzr_concretize (concr lang) (expr e) exn pl failed <- gu_exn_is_raised exn if failed then do touchExpr e throwExn exn else do ref <- newIORef ([],[]) bss <- withBracketLinFuncs ref exn $ \ppLinFuncs -> collect ref cts ppLinFuncs exn pl touchExpr e return bss where collect ref cts ppLinFuncs exn pl = withGuPool $ \tmpPl -> do ctree <- alloca $ \ptr -> do gu_enum_next cts ptr tmpPl peek ptr if ctree == nullPtr then return [] else do ctree <- pgf_lzr_wrap_linref ctree pl pgf_lzr_linearize (concr lang) ctree 0 ppLinFuncs pl failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then collect ref cts ppLinFuncs exn pl else throwExn exn else do (_,bs) <- readIORef ref writeIORef ref ([],[]) bss <- collect ref cts ppLinFuncs exn pl return (reverse bs : bss) withBracketLinFuncs ref exn f = allocaBytes (#size PgfLinFuncs) $ \pLinFuncs -> alloca $ \ppLinFuncs -> do fptr_symbol_token <- wrapSymbolTokenCallback (symbol_token ref) fptr_begin_phrase <- wrapPhraseCallback (begin_phrase ref) fptr_end_phrase <- wrapPhraseCallback (end_phrase ref) fptr_symbol_ne <- wrapSymbolNonExistCallback (symbol_ne exn) fptr_symbol_bind <- wrapSymbolBindCallback (symbol_bind ref) fptr_symbol_meta <- wrapSymbolMetaCallback (symbol_meta ref) (#poke PgfLinFuncs, symbol_token) pLinFuncs fptr_symbol_token (#poke PgfLinFuncs, begin_phrase) pLinFuncs fptr_begin_phrase (#poke PgfLinFuncs, end_phrase) pLinFuncs fptr_end_phrase (#poke PgfLinFuncs, symbol_ne) pLinFuncs fptr_symbol_ne (#poke PgfLinFuncs, symbol_bind) pLinFuncs fptr_symbol_bind (#poke PgfLinFuncs, symbol_capit) pLinFuncs nullPtr (#poke PgfLinFuncs, symbol_meta) pLinFuncs fptr_symbol_meta poke ppLinFuncs pLinFuncs res <- f ppLinFuncs freeHaskellFunPtr fptr_symbol_token freeHaskellFunPtr fptr_begin_phrase freeHaskellFunPtr fptr_end_phrase freeHaskellFunPtr fptr_symbol_ne freeHaskellFunPtr fptr_symbol_bind freeHaskellFunPtr fptr_symbol_meta return res where symbol_token ref _ c_token = do (stack,bs) <- readIORef ref token <- peekUtf8CString c_token writeIORef ref (stack,Leaf token : bs) begin_phrase ref _ c_cat c_fid c_ann c_fun = do (stack,bs) <- readIORef ref writeIORef ref (bs:stack,[]) end_phrase ref _ c_cat c_fid c_ann c_fun = do (bs':stack,bs) <- readIORef ref if null bs then writeIORef ref (stack, bs') else do cat <- peekUtf8CString c_cat let fid = fromIntegral c_fid ann <- peekUtf8CString c_ann fun <- peekUtf8CString c_fun writeIORef ref (stack, Bracket cat fid ann fun (reverse bs) : bs') symbol_ne exn _ = do gu_exn_raise exn gu_exn_type_PgfLinNonExist return () symbol_bind ref _ = do (stack,bs) <- readIORef ref writeIORef ref (stack,BIND : bs) return () symbol_meta ref _ meta_id = do (stack,bs) <- readIORef ref writeIORef ref (stack,Leaf "?" : bs) throwExn exn = do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg throwIO (PGFError msg) else do throwIO (PGFError "The abstract tree cannot be linearized") alignWords :: Concr -> Expr -> [(String, [Int])] alignWords lang e = unsafePerformIO $ withGuPool $ \pl -> do exn <- gu_new_exn pl seq <- pgf_align_words (concr lang) (expr e) exn pl touchConcr lang touchExpr e failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then return [] else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg throwIO (PGFError msg) else throwIO (PGFError "The abstract tree cannot be linearized") else do len <- (#peek GuSeq, len) seq arr <- peekArray (fromIntegral (len :: CInt)) (seq `plusPtr` (#offset GuSeq, data)) mapM peekAlignmentPhrase arr where peekAlignmentPhrase :: Ptr () -> IO (String, [Int]) peekAlignmentPhrase ptr = do c_phrase <- (#peek PgfAlignmentPhrase, phrase) ptr phrase <- peekUtf8CString c_phrase n_fids <- (#peek PgfAlignmentPhrase, n_fids) ptr (fids :: [CInt]) <- peekArray (fromIntegral (n_fids :: CInt)) (ptr `plusPtr` (#offset PgfAlignmentPhrase, fids)) return (phrase, map fromIntegral fids) printName :: Concr -> Fun -> Maybe String printName lang fun = unsafePerformIO $ withGuPool $ \tmpPl -> do c_fun <- newUtf8CString fun tmpPl c_name <- pgf_print_name (concr lang) c_fun name <- if c_name == nullPtr then return Nothing else fmap Just (peekUtf8CString c_name) touchConcr lang return name -- | List of all functions defined in the abstract syntax functions :: PGF -> [Fun] functions p = unsafePerformIO $ withGuPool $ \tmpPl -> allocaBytes (#size GuMapItor) $ \itor -> do exn <- gu_new_exn tmpPl ref <- newIORef [] fptr <- wrapMapItorCallback (getFunctions ref) (#poke GuMapItor, fn) itor fptr pgf_iter_functions (pgf p) itor exn touchPGF p freeHaskellFunPtr fptr fs <- readIORef ref return (reverse fs) where getFunctions :: IORef [String] -> MapItorCallback getFunctions ref itor key value exn = do names <- readIORef ref name <- peekUtf8CString (castPtr key) writeIORef ref $! (name : names) -- | List of all functions defined for a category functionsByCat :: PGF -> Cat -> [Fun] functionsByCat p cat = unsafePerformIO $ withGuPool $ \tmpPl -> allocaBytes (#size GuMapItor) $ \itor -> do exn <- gu_new_exn tmpPl ref <- newIORef [] fptr <- wrapMapItorCallback (getFunctions ref) (#poke GuMapItor, fn) itor fptr ccat <- newUtf8CString cat tmpPl pgf_iter_functions_by_cat (pgf p) ccat itor exn touchPGF p freeHaskellFunPtr fptr fs <- readIORef ref return (reverse fs) where getFunctions :: IORef [String] -> MapItorCallback getFunctions ref itor key value exn = do names <- readIORef ref name <- peekUtf8CString (castPtr key) writeIORef ref $! (name : names) -- | List of all categories defined in the grammar. -- The categories are defined in the abstract syntax -- with the \'cat\' keyword. categories :: PGF -> [Cat] categories p = unsafePerformIO $ withGuPool $ \tmpPl -> allocaBytes (#size GuMapItor) $ \itor -> do exn <- gu_new_exn tmpPl ref <- newIORef [] fptr <- wrapMapItorCallback (getCategories ref) (#poke GuMapItor, fn) itor fptr pgf_iter_categories (pgf p) itor exn touchPGF p freeHaskellFunPtr fptr cs <- readIORef ref return (reverse cs) where getCategories :: IORef [String] -> MapItorCallback getCategories ref itor key value exn = do names <- readIORef ref name <- peekUtf8CString (castPtr key) writeIORef ref $! (name : names) categoryContext :: PGF -> Cat -> [Hypo] categoryContext p cat = unsafePerformIO $ withGuPool $ \tmpPl -> do c_cat <- newUtf8CString cat tmpPl c_hypos <- pgf_category_context (pgf p) c_cat if c_hypos == nullPtr then return [] else do n_hypos <- (#peek GuSeq, len) c_hypos peekHypos (c_hypos `plusPtr` (#offset GuSeq, data)) 0 n_hypos where peekHypos :: Ptr a -> Int -> Int -> IO [Hypo] peekHypos c_hypo i n | i < n = do cid <- (#peek PgfHypo, cid) c_hypo >>= peekUtf8CString c_ty <- (#peek PgfHypo, type) c_hypo bt <- fmap toBindType ((#peek PgfHypo, bind_type) c_hypo) hs <- peekHypos (plusPtr c_hypo (#size PgfHypo)) (i+1) n return ((bt,cid,Type c_ty (touchPGF p)) : hs) | otherwise = return [] toBindType :: CInt -> BindType toBindType (#const PGF_BIND_TYPE_EXPLICIT) = Explicit toBindType (#const PGF_BIND_TYPE_IMPLICIT) = Implicit categoryProb :: PGF -> Cat -> Float categoryProb p cat = unsafePerformIO $ withGuPool $ \tmpPl -> do c_cat <- newUtf8CString cat tmpPl c_prob <- pgf_category_prob (pgf p) c_cat touchPGF p return (realToFrac c_prob) ----------------------------------------------------------------------------- -- Helper functions fromPgfExprEnum :: Ptr GuEnum -> ForeignPtr GuPool -> IO () -> IO [(Expr, Float)] fromPgfExprEnum enum fpl touch = do pgfExprProb <- alloca $ \ptr -> withForeignPtr fpl $ \pl -> do gu_enum_next enum ptr pl peek ptr if pgfExprProb == nullPtr then do finalizeForeignPtr fpl return [] else do expr <- (#peek PgfExprProb, expr) pgfExprProb ts <- unsafeInterleaveIO (fromPgfExprEnum enum fpl touch) prob <- (#peek PgfExprProb, prob) pgfExprProb return ((Expr expr touch,prob) : ts) ----------------------------------------------------------------------- -- Exceptions newtype PGFError = PGFError String deriving (Show, Typeable) instance Exception PGFError ----------------------------------------------------------------------- type LiteralCallback = PGF -> (ConcName,Concr) -> String -> String -> Int -> Maybe (Expr,Float,Int) -- | Callbacks for the App grammar literalCallbacks :: [(AbsName,[(Cat,LiteralCallback)])] literalCallbacks = [("App",[("PN",nerc),("Symb",chunk)])] -- | Named entity recognition for the App grammar -- (based on ../java/org/grammaticalframework/pgf/NercLiteralCallback.java) nerc :: LiteralCallback nerc pgf (lang,concr) sentence lin_idx offset = case consume capitalized (drop offset sentence) of (capwords@(_:_),rest) | not ("Eng" `isSuffixOf` lang && name `elem` ["I","I'm"]) -> if null ls then pn else case cat of "PN" -> retLit (mkApp lemma []) "WeekDay" -> retLit (mkApp "weekdayPN" [mkApp lemma []]) "Month" -> retLit (mkApp "monthPN" [mkApp lemma []]) _ -> Nothing where retLit e = Just (e,0,end_offset) where end_offset = offset+length name pn = retLit (mkApp "SymbPN" [mkApp "MkSymb" [mkStr name]]) ((lemma,cat),_) = maximumBy (compare `on` snd) (reverse ls) ls = [((fun,cat),p) |(fun,_,p)<-lookupMorpho concr name, Just cat <- [functionCat fun], cat/="Nationality"] name = trimRight (concat capwords) _ -> Nothing where -- | Variant of unfoldr consume munch xs = case munch xs of Nothing -> ([],xs) Just (y,xs') -> (y:ys,xs'') where (ys,xs'') = consume munch xs' functionCat f = fmap ((\(_,c,_) -> c) . unType) (functionType pgf f) -- | Callback to parse arbitrary words as chunks (from -- ../java/org/grammaticalframework/pgf/UnknownLiteralCallback.java) chunk :: LiteralCallback chunk _ (_,concr) sentence lin_idx offset = case uncapitalized (drop offset sentence) of Just (word0@(_:_),rest) | null (lookupMorpho concr word) -> Just (expr,0,offset+length word) where word = trimRight word0 expr = mkApp "MkSymb" [mkStr word] _ -> Nothing -- More helper functions trimRight = reverse . dropWhile isSpace . reverse capitalized = capitalized' isUpper uncapitalized = capitalized' (not.isUpper) capitalized' test s@(c:_) | test c = case span (not.isSpace) s of (name,rest1) -> case span isSpace rest1 of (space,rest2) -> Just (name++space,rest2) capitalized' not s = Nothing