Skip to content

Instantly share code, notes, and snippets.

@edvardm
Created October 3, 2011 09:36
Show Gist options
  • Save edvardm/1258782 to your computer and use it in GitHub Desktop.
Save edvardm/1258782 to your computer and use it in GitHub Desktop.
My take on Haskell: Functional Programming, Solid Code, Big Data by Bryan O'Sullivan @ Strange Loop 2011
import Network.HTTP.Enumerator
import Data.ByteString.Lazy.UTF8 as BS
import Text.HTML.TagSoup
import Control.Monad
import Data.Maybe
import Network.URI
-- | Fetch the document at the given URL and decode its body as UTF-8.
-- Uses 'fmap' rather than the legacy monad-only 'liftM' (hlint: "Use fmap").
-- NOTE(review): decoding assumes the server actually sent UTF-8 — not
-- verified here; 'simpleHttp' comes from http-enumerator (2011-era API).
download :: String -> IO String
download uri = fmap BS.toString (simpleHttp uri)
-- | Persist an HTML document to disk under a numbered file name
-- (see 'mkFileName'); overwrites any existing file of that name.
savePage :: String -> Int -> IO ()
savePage contents pageNo = writeFile (mkFileName pageNo) contents
-- | On-disk name for the k-th saved page, e.g. @mkFileName 7 == "page-7.html"@.
mkFileName :: Int -> FilePath
mkFileName pageNo = concat ["page-", show pageNo, ".html"]
-- | True exactly when a tag-soup token is an opening tag.
-- (Not referenced elsewhere in this file; 'limitToTags' uses
-- 'isTagOpenName' instead.)
isOpenTag :: Tag t -> Bool
isOpenTag tag = case tag of
  TagOpen _ _ -> True
  _           -> False
-- | Resolve a possibly-relative link against the referring page's URI
-- and render the result back to a string.  Yields 'Nothing' when either
-- URI fails to parse or relative resolution is impossible.
--
-- NOTE(review): written against the 2011-era Network.URI API where
-- 'nonStrictRelativeTo' returns a 'Maybe'; in modern network-uri it
-- returns 'URI' directly — confirm against the pinned package version.
canonLnk :: String -> String -> Maybe String
canonLnk referer path =
  parseURI referer >>= \base ->
    parseURIReference path >>= \ref ->
      nonStrictRelativeTo ref base >>= \resolved ->
        Just (uriToString id resolved "")
-- | Download a page and return its outgoing links, each resolved
-- against the page's own URL (see 'getLinks').
-- Uses 'fmap' rather than the legacy monad-only 'liftM' (hlint: "Use fmap").
scrapePage :: String -> IO [String]
scrapePage url = fmap (getLinks url) (download url)
-- | Extract every followable, absolute link target from an HTML document.
--
-- Pipeline, reading right to left: parse the markup, normalise tag
-- names, keep only the opening @\<a\>@ tags, drop @rel="nofollow"@
-- links, pull out the @href@ attributes, discard empty ones, and
-- resolve each against the page's own URL — links that fail to parse
-- are silently dropped by 'mapMaybe'.
--
-- 'mapMaybe' replaces the original @catMaybes . map@ (hlint:
-- "Use mapMaybe"); it is in scope via the existing @Data.Maybe@ import.
getLinks :: String -> String -> [String]
getLinks url =
    mapMaybe (canonLnk url) .
    filter (not . null) .
    hrefs .
    filter (not . noFollow) .
    limitToTags "a" .
    canonicalizeTags . parseTags
-- | The @href@ attribute of every tag in the list
-- ('fromAttrib' yields @\"\"@ when the attribute is absent).
hrefs :: [Tag String] -> [String]
hrefs tags = [fromAttrib "href" tag | tag <- tags]
-- | Does the tag carry @rel="nofollow"@ (i.e. should crawlers skip it)?
noFollow :: Tag String -> Bool
noFollow = ("nofollow" ==) . fromAttrib "rel"
-- | Keep only the opening tags with the given name, e.g.
-- @limitToTags "a"@ keeps the @\<a ...\>@ tokens.
limitToTags :: String -> [Tag String] -> [Tag String]
limitToTags name = filter (isTagOpenName name)
-- | Scrape the links from the front page of hs.fi and store them,
-- space-separated, as @page-1.html@ in the current directory.
main :: IO ()
main =
  scrapePage "http://hs.fi/" >>= \links ->
    savePage (unwords links) 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment