@merijn
Created July 9, 2015 08:20
Concurrent webscraper
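
A small scraper that fans page fetches out with async's Concurrently while a QSem caps the number of simultaneous HTTP connections.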
{-# LANGUAGE OverloadedStrings #-}
import Control.Concurrent.Async
import Control.Concurrent.QSem
import Control.Monad
import Control.Monad.Catch
import Control.Monad.Trans
import Control.Monad.Reader
import Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as LBS
import Data.Foldable
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Conc.Sync
import Network (withSocketsDo)
import Network.URI
import Network.HTTP.Conduit hiding (withManager)
import Text.XML.Cursor
import Text.HTML.DOM
import System.FilePath
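
-- The Scraper stack: ReaderT hands every action the shared rate-limiting
-- QSem and http-conduit Manager, layered over async's Concurrently.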
-- async ships only Functor/Applicative/Alternative instances for
-- Concurrently, so the do-blocks below need a sequential Monad (and
-- MonadIO) instance; Applicative combinators still run concurrently.
instance Monad Concurrently where
    return = pure
    Concurrently a >>= f = Concurrently $ a >>= runConcurrently . f

instance MonadIO Concurrently where
    liftIO = Concurrently

type Scraper = ReaderT (QSem, Manager) Concurrently
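
-- Numbered archive pages to scrape; 'take 1' restricts the run to the
-- first page while testing, and the list is not yet wired into main.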
urls :: [Text]
urls = take 1 [baseURL <> T.pack (show x) | x <- [(1::Int)..51]]
  where
    baseURL = "http://example.com/base/url"
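
-- Fetch one URL, holding a QSem slot for the duration of the request so
-- at most 'count' connections are open at once; bracket_ releases the
-- slot even if httpLbs throws.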
fetch :: Text -> Scraper ByteString
fetch url = do
    (qsem, manager) <- ask
    src <- liftIO $ parseUrl (T.unpack url)
    liftIO . bracket_ (waitQSem qsem) (signalQSem qsem) $
        responseBody <$> httpLbs src manager
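
-- Run a Scraper with at most 'count' concurrent connections, creating
-- and tearing down the shared Manager around the whole run.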
runScraper :: Int -> Scraper a -> IO a
runScraper count scraper = do
    qsem <- newQSem count
    bracket (newManager conduitManagerSettings) closeManager $ \manager ->
        runConcurrently $ runReaderT scraper (qsem, manager)
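
-- Download the comic image linked from one page. Note the partial
-- patterns: the match fails at runtime unless the page has exactly one
-- <img> under <div id="comic"> whose src parses as a URI.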
grabComic :: Text -> Scraper ()
grabComic url = do
    cursor <- fromDocument . parseLBS <$> fetch url
    let [comicSrc] = cursor $// element "div"
                         >=> attributeIs "id" "comic"
                         >=> descendant
                         >=> element "img"
                         >=> attribute "src"
        Just comicURI = parseURI . T.unpack $ comicSrc
        outputFile = takeFileName $ uriPath comicURI
    fetch comicSrc >>= liftIO . LBS.writeFile outputFile
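
-- Collect the comic links from the archive page; once the print below is
-- swapped for the commented traverse_, the downloads run concurrently,
-- since traverse_ goes through the Applicative. The uppercase "A"/"HREF"
-- assume the parser preserves the page's tag case; lowercase "a"/"href"
-- may be needed depending on how Text.HTML.DOM normalizes names.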
crawlArchive :: Text -> Scraper ()
crawlArchive url = do
    cursor <- fromDocument . parseLBS <$> fetch url
    let comicUrls = cursor $// element "A" >=> attribute "HREF"
    liftIO $ print comicUrls
    --traverse_ grabComic comicUrls
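
-- Use every available core, then scrape with up to 100 connections.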
main :: IO ()
main = withSocketsDo $ do
    getNumProcessors >>= setNumCapabilities
    runScraper 100 $ crawlArchive "http://example.com/"
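
For reference, a minimal sketch of the same cap-and-fan-out pattern without the ReaderT wrapper; the worker body and inputs are placeholders, and it uses only the QSem and async functions already imported above:

-- Each worker holds a semaphore slot around its critical section, so at
-- most 10 of the 100 concurrently started workers do real work at once.
demo :: IO ()
demo = do
    qsem <- newQSem 10
    let worker n = bracket_ (waitQSem qsem) (signalQSem qsem) $
            print n  -- placeholder for a real fetch-and-save
    void $ mapConcurrently worker [1 .. 100 :: Int]

-- Building: the threaded runtime is needed for setNumCapabilities to do
-- anything (package set assumed: async, exceptions, http-conduit,
-- html-conduit, xml-conduit, network, network-uri, filepath):
--
--     ghc -threaded -O2 scraper.hs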
@gregnwosu
lovely, and thanks greg` from #haskell
