Haskell | |
![]()
最終的なコードと利用サンプル,そしてそれらをビルドする cabal ファイルを最後に載せておく.
libxml.cabal
Name: libxml Version: 0.1 License: BSD3 Category: Text Build-type: Simple Cabal-version: >=1.6 Executable sample Main-is: Main.hs Other-Modules: LibXml Build-depends: base == 4.*, bytestring Build-tools: hsc2hs Hs-source-dirs: src Includes: wrapper.h Include-dirs: include C-sources: cbits/wrapper.c Extra-libraries: xml2
{-# LANGUAGE ForeignFunctionInterface, EmptyDataDecls #-} module LibXml ( HtmlParserOption, recover, noError, noWarning, pedantic, noBlanks, noNet, noImplied, compact , XmlDoc, Node , parseHTML , xpath , textContent ) where #include <libxml/HTMLparser.h> #include <libxml/xpath.h> import Control.Applicative ((<$>)) import Control.Exception (bracket) import Data.Bits ((.|.)) import System.IO.Unsafe (unsafePerformIO) import Foreign.Ptr (Ptr, nullPtr) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.C.String (CString, withCString) import Foreign.C.Types (CInt) import Foreign.Concurrent (newForeignPtr) import Foreign.Storable (peekByteOff) import Foreign.Marshal.Array (peekArray) import qualified Data.ByteString as B data XmlDocTag newtype XmlDoc = MkXmlDoc (ForeignPtr XmlDocTag) data NodeTag newtype Node = MkNode (Ptr NodeTag) data NodeSetTag data XPathContextTag data XPathObjectTag newtype HtmlParserOption = HtmlParserOption { unHtmlParserOption :: CInt } #{enum HtmlParserOption, HtmlParserOption , recover = HTML_PARSE_RECOVER , noError = HTML_PARSE_NOERROR , noWarning = HTML_PARSE_NOWARNING , pedantic = HTML_PARSE_PEDANTIC , noBlanks = HTML_PARSE_NOBLANKS , noNet = HTML_PARSE_NONET , noImplied = HTML_PARSE_NOIMPLIED , compact = HTML_PARSE_COMPACT } foldOpts :: [HtmlParserOption] -> CInt foldOpts = foldl (\a -> (a .|. ) . unHtmlParserOption) 0 withMaybeCString :: Maybe String -> (CString -> IO a) -> IO a withMaybeCString (Just str) act = withCString str act withMaybeCString Nothing act = act nullPtr parseHTML :: B.ByteString -> String -> Maybe String -> [HtmlParserOption] -> XmlDoc parseHTML src url encoding opts = unsafePerformIO $ do ptr <- B.useAsCStringLen src $ \(cstr,len) -> withMaybeCString encoding $ \e -> withCString url $ \u -> c_htmlReadMemory cstr (fromIntegral len) e u (foldOpts opts) fptr <- newForeignPtr ptr $ c_xmlFreeDoc ptr return $ MkXmlDoc fptr foreign import ccall "htmlReadMemory" c_htmlReadMemory :: CString -> CInt -> CString -> CString -> CInt -> IO (Ptr XmlDocTag) foreign import ccall "xmlFreeDoc" c_xmlFreeDoc :: Ptr XmlDocTag -> IO () xpath :: String -> XmlDoc -> Maybe ([Node],Int) xpath s (MkXmlDoc docPtr) = unsafePerformIO $ do ctx <- withForeignPtr docPtr c_xmlXPathNewContext obj <- withCString s (`c_xmlXPathEvalExpression` ctx) if obj == nullPtr then return Nothing else do nodes <- (#peek xmlXPathObject, nodesetval) obj :: IO (Ptr NodeSetTag) if nodes == nullPtr then c_xmlXPathFreeObject obj >> return Nothing else do nr <- fromIntegral <$> ((#peek xmlNodeSet, nodeNr) nodes :: IO CInt) tab <- (#peek xmlNodeSet, nodeTab) nodes :: IO (Ptr (Ptr NodeTag)) ns <- map MkNode <$> peekArray nr tab c_xmlXPathFreeObject obj return $ Just (ns,nr) foreign import ccall "xmlXPathNewContext" c_xmlXPathNewContext :: Ptr XmlDocTag -> IO (Ptr XPathContextTag) foreign import ccall "xmlXPathEvalExpression" c_xmlXPathEvalExpression :: CString -> Ptr XPathContextTag -> IO (Ptr XPathObjectTag) foreign import ccall "xmlXPathFreeObject" c_xmlXPathFreeObject :: Ptr XPathObjectTag -> IO () textContent :: Node -> B.ByteString textContent (MkNode ptr) = unsafePerformIO $ bracket (c_xmlNodeGetContent ptr) c_xmlFree B.packCString foreign import ccall "xmlNodeGetContent" c_xmlNodeGetContent :: Ptr NodeTag -> IO CString foreign import ccall "xmlFree_wrap" c_xmlFree :: Ptr a -> IO ()
include/wrapper.h
#ifndef LIBXML_WRAPPER_H #define LIBXML_WRAPPER_H void xmlFree_wrap(void *mem); #endif
cbits/wrapper.c
#include <libxml/globals.h> #include "wrapper.h" void xmlFree_wrap(void *mem) { xmlFree(mem); }
src/Main.hs
{-# LANGUAGE OverloadedStrings #-} module Main where import Control.Monad (forM_) import System.Environment (getArgs) import qualified Data.ByteString.Char8 as B8 import LibXml main :: IO () main = do args <- getArgs forM_ args $ \path -> do str <- B8.readFile path let doc = parseHTML str "" (Just "UTF-8") [recover, noError, noWarning, noNet] case xpath "//a[@href]" doc of Nothing -> putStrLn $ path ++ ": failed" Just (ns,nr) -> putStrLn (path ++ ": " ++ show nr ++ " nodes") >> mapM_ (B8.putStrLn . B8.append " " . textContent) ns