
时间:2013-03-15 20:56:22

标签: haskell web-scraping monads




-- | Scrape the page at the given URL: read the title and the date, then
-- (per the placeholder comments below) build a page object and store it.
--
-- NOTE(review): the two 'when' lines below do NOT leave the function early.
-- In a do-block, @return ()@ only yields a unit value; execution carries on
-- with the next statement whether or not the value was found.
scrapePage :: String -> IO ()
scrapePage url = do
  doc <- fromUrl url
  -- First matching title text, or Nothing when the selector matches nothing.
  title <- liftM headMay $ runX $ doc >>> css "head.title" >>> getText
  -- No-op: `when cond (return ())` behaves the same for either value of cond.
  when (isNothing title) (return ())
  -- First matching "data-utc" attribute value, or Nothing when there is none.
  date <- liftM headMay $ runX $ doc >>> css "span.dateTime" ! "data-utc"
  -- No-op for the same reason as above.
  when (isNothing date) (return ())
  -- etc
  -- make page object and send it to db
  return ()



3 个答案:

答案 0 :(得分:16)




-- | Scrape the page at the given URL, stopping as soon as a required field
-- is missing. The early exit is expressed by nesting: the rest of the work
-- lives in the @else@ branch, so it only runs when the value was found.
scrapePage :: String -> IO ()
scrapePage url = do
  doc <- fromUrl url
  title <- fmap headMay . runX $ doc >>> css "head.title" >>> getText
  if isNothing title
    then return ()
    else do
      date <- fmap headMay . runX $ doc >>> css "span.dateTime" ! "data-utc"
      if isNothing date
        then return ()
        else do
          -- etc
          -- make page object and send it to db
          return ()


-- Scrape the page at the given URL, continuing only while each required
-- field is present. 'unless' runs its nested block only when the value was
-- found, so a missing title or date ends the action with ().
scrapePage url = do
  doc <- fromUrl url
  title <- liftM headMay $ runX $ doc >>> css "head.title" >>> getText
  -- The `$` before `do` is required: passing a bare `do` block as an
  -- argument only parses when the BlockArguments extension is enabled.
  unless (isNothing title) $ do
    date <- liftM headMay $ runX $ doc >>> css "span.dateTime" ! "data-utc"
    unless (isNothing date) $ do
      -- etc
      -- make page object and send it to db
      return ()

这里的根本问题在于 IO monad 本身没有控制流效果(异常除外)。不过,你可以改用 Maybe monad 变换器(MaybeT):

-- Scrape the page inside MaybeT over IO. A failing 'guard' aborts the
-- remaining steps; the resulting Maybe () is then collapsed back to ().
scrapePage url = fmap (maybe () id) . runMaybeT $ do
  doc <- liftIO (fromUrl url)
  title <- liftIO . fmap headMay . runX $ doc >>> css "head.title" >>> getText
  guard (isJust title)
  date <- liftIO . fmap headMay . runX $ doc >>> css "span.dateTime" ! "data-utc"
  guard (isJust date)
  -- etc
  -- make page object and send it to db
  return ()


-- | Scrape the page at the given URL, using the continuation monad to leave
-- early when a required field is missing.
--
-- The whole body is wrapped in a single 'callCC', which hands us @exit@: the
-- continuation of the entire block. Calling @exit ()@ therefore skips every
-- remaining statement. (Calling 'callCC' at the point of the check instead
-- would capture the continuation of that very statement, i.e. it would just
-- carry on with the next line.)
scrapePage :: String -> IO ()
scrapePage url = flip runContT return $ callCC $ \exit -> do
  -- 'runContT' takes the ContT action first and the final continuation
  -- second, hence the 'flip'. IO actions must be lifted into ContT.
  doc <- liftIO $ fromUrl url
  title <- liftIO $ liftM headMay $ runX $ doc >>> css "head.title" >>> getText
  when (isNothing title) $ exit ()
  date <- liftIO $ liftM headMay $ runX $ doc >>> css "span.dateTime" ! "data-utc"
  when (isNothing date) $ exit ()
  -- etc
  -- make page object and send it to db
  return ()


答案 1 :(得分:12)


import Control.Monad.Trans.Class -- from transformers package
import Control.Error.Util        -- from errors package

-- | Scrape the page inside MaybeT over IO, eliminated with 'maybeT': both
-- the aborted case and the completed case end in @()@. A failing 'guard'
-- skips every remaining step.
scrapePage :: String -> IO ()
scrapePage url = maybeT (return ()) return $ do
  doc <- lift (fromUrl url)
  title <- lift . liftM headMay . runX $ doc >>> css "head.title" >>> getText
  guard (not (isNothing title))
  date <- lift . liftM headMay . runX $ doc >>> css "span.dateTime" ! "data-utc"
  guard (not (isNothing date))
  -- etc
  -- make page object and send it to db
  return ()

如果希望在提前返回时能更灵活地携带返回值,请使用 throwError / eitherT / EitherT 来代替 mzero / maybeT / MaybeT。(不过这样一来就不能再使用 MaybeT 了。)


答案 2 :(得分:2)

我从未使用过 Haskell,但这看起来很容易做到。试试 when (isNothing date) $ exit ()。如果这样也不起作用,请确认你的语句是否正确。更多信息另请参阅这个网站:Breaking From loop