Haskell Web Spider example

import Control.Exception
import Control.Monad
import Control.Monad.IO.Class
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Lazy.UTF8 (toString)
import Data.Function
import Data.Enumerator
import Data.List
import Data.Maybe
import Data.Map (Map)
import Data.Set (Set)
import Network.HTTP.Enumerator
import Network.URI
import Text.HTML.TagSoup
import qualified Data.ByteString.Lazy as B
import qualified Data.Enumerator.List as EL
import qualified Data.Map as Map
import qualified Data.Set as Set

type Resource = (URL, Content)
type URL = String
type Content = ByteString

-- Naive spider: breadth-first crawl of at most `count` pages, returning a
-- map from each visited URL to the links found on that page.
spider :: Int -> URL -> IO (Map URL [URL])
spider count url = go count [url] Map.empty where
    go 0 _ m = return m
    go _ [] m = return m
    go c (url:urls) history
        | url `Map.member` history = go c urls history
        | otherwise = do
            res <- fetchResource url
            let links = findLinks res
                queue = urls ++ links
            go (c-1) queue (Map.insert url links history)
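
-- A usage sketch, not part of the original gist: the seed URL and the page
-- limit below are placeholders.  Crawl at most ten pages and print each
-- visited URL together with the number of links found on it.
spiderDemo :: IO ()
spiderDemo = do
    graph <- spider 10 "http://example.com"
    forM_ (Map.toList graph) $ \(page, links) ->
        putStrLn $ page ++ ": " ++ show (length links) ++ " links"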

-- Fetch a page and pair the response body with its URL.
fetchResource :: URL -> IO Resource
fetchResource url = fmap ((,) url) $ simpleHttp url
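
-- A defensive variant, sketched here rather than taken from the original
-- gist: simpleHttp throws an exception when a request fails, which would
-- abort the whole crawl, so this wrapper catches any exception and returns
-- an empty body instead.
fetchResourceSafe :: URL -> IO Resource
fetchResourceSafe url = do
    result <- try (simpleHttp url) :: IO (Either SomeException Content)
    return $ case result of
        Left _     -> (url, B.empty)
        Right body -> (url, body)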

-- Extract every <a href> target from a fetched page and resolve it against
-- the page's own URL.
findLinks :: Resource -> [URL]
findLinks (baseurl, c) = normalize . findHrefs $ tags where
    tags = canonicalizeTags . parseTags . toString $ c
    findHrefs = mapMaybe href
    normalize = mapMaybe $ normalizeLink baseurl

-- Resolve a possibly relative link against the page's base URL; links (or
-- bases) that do not parse as URIs are dropped.
normalizeLink :: URL -> URL -> Maybe URL
normalizeLink base url = toString $ join $ url `relTo` base where
    relTo a b = liftM2 nonStrictRelativeTo (parseURIReference a) (parseURI b)
    toString = fmap $ ($"") . uriToString id
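
-- Illustrative results, assuming the usual RFC 3986 resolution rules from
-- Network.URI (these examples are not part of the original gist):
--
--   normalizeLink "http://example.com/docs/" "about.html"
--       == Just "http://example.com/docs/about.html"
--   normalizeLink "http://example.com/docs/" "http://other.org/x"
--       == Just "http://other.org/x"
--   normalizeLink "http://example.com/" "not a uri"
--       == Nothing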

-- Pick the href attribute out of an anchor tag, skipping rel="nofollow"
-- links.
href :: Tag String -> Maybe String
href (TagOpen "a" attrs) = case lookup "rel" attrs of
    Just "nofollow" -> Nothing
    _               -> lookup "href" attrs
href _ = Nothing
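
-- Illustrative cases, not from the original gist:
--
--   href (TagOpen "a" [("href", "/contact")])                      == Just "/contact"
--   href (TagOpen "a" [("rel", "nofollow"), ("href", "/private")]) == Nothing
--   href (TagText "plain text")                                    == Nothing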

---- Improved version using enumerator and iteratee

-- Spider as an Enumerator: it feeds crawled pages one at a time into
-- whatever Iteratee is attached, so the consumer decides when to stop.
spiderEnum :: MonadIO m => URL -> Enumerator Resource m b
spiderEnum url = go [url] Set.empty where
    go [] _ step = returnI step
    go (url:urls) visited step
        | url `Set.member` visited = go urls visited step
        | otherwise = case step of
            Continue k -> do
                res <- liftIO $ fetchResource url
                let links = findLinks res
                    cont = go (urls ++ links) (Set.insert url visited)
                k (Chunks [res]) >>== cont
            _ -> returnI step

-- Print the URL of every resource that flows through.
printURLs :: Iteratee Resource IO ()
printURLs = EL.mapM_ $ putStrLn . fst

-- Crawl from the seed URL, stop after ten pages, and print each URL.
main = run_ task where
    task = spiderEnum "http://leonidasoy.fi" $$ EL.isolate 10 $$ printURLs
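
-- A sketch of reusing the enumerator with a different iteratee (the name and
-- the limit of ten pages are choices made here, not taken from the original
-- gist): collect the first crawled resources into a list instead of printing
-- them.  joinI discards the inner Step left behind by EL.isolate so that
-- run_ can return a plain list.
firstTenURLs :: URL -> IO [URL]
firstTenURLs url = do
    resources <- run_ $ spiderEnum url $$ joinI (EL.isolate 10 $$ EL.consume)
    return $ map fst resources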