Last active
August 29, 2015 14:21
-
-
Save reneklacan/262eef04a3bc67607dd8 to your computer and use it in GitHub Desktop.
Crawl a page and print a JSON with site info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
import Data.Tree.NTree.TypeDefs | |
import Text.XML.HXT.Core | |
import Data.Text hiding (length,foldl) | |
import qualified Data.HashMap.Strict as M | |
import Data.Aeson | |
import Text.HandsomeSoup | |
data SiteInfo = SiteInfo | |
{ title :: Text | |
, githubUrl :: Text | |
, articleCount :: Int | |
, articleNames :: [Text] | |
} deriving Show | |
instance ToJSON SiteInfo where | |
toJSON (SiteInfo title githubUrl articleCount articleNames) = | |
object | |
[ "title" .= title | |
, "github_url" .= githubUrl | |
, "article_count" .= articleCount | |
, "article_names" .= articleNames | |
] | |
siteTitle :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String | |
siteTitle tree = tree >>> css "a.site-title" //> getText | |
siteGithubUrl :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String | |
siteGithubUrl tree = tree >>> css "a.fa-github" ! "href" | |
siteArticleNames :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String | |
siteArticleNames tree = tree >>> css ".post-title" //> getText | |
getList | |
:: IOSArrow XmlTree (NTree XNode) | |
-> (IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String) | |
-> IO [String] | |
getList doc selector = runX $ selector doc | |
getOne | |
:: IOSArrow XmlTree (NTree XNode) | |
-> (IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String) | |
-> IO String | |
getOne doc selector = fmap (foldl (++) "") $ getList doc selector | |
getInfo :: IOSArrow XmlTree (NTree XNode) -> IO SiteInfo | |
getInfo doc = do | |
sTitle <- getOne doc siteTitle | |
sGithubUrl <- getOne doc siteGithubUrl | |
sArticleNames <- getList doc siteArticleNames | |
return | |
SiteInfo | |
{ title = pack sTitle | |
, githubUrl = pack sGithubUrl | |
, articleCount = length sArticleNames | |
, articleNames = fmap pack sArticleNames | |
} | |
get :: String -> IO (IOSArrow XmlTree (NTree XNode)) | |
get url = do | |
return $ fromUrl url | |
main :: IO () | |
main = do | |
doc <- get "http://rene.klacan.sk" | |
hash <- fmap encode $ getInfo doc |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment