Created
August 8, 2018 10:07
-
-
Save n4to4/77ad724cae66abf1cb69003d7cf2a5be to your computer and use it in GitHub Desktop.
scrape.hs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: scrape | |
version: 0.1.0.0 | |
-- synopsis: | |
-- description: | |
homepage: https://github.com/githubuser/scrape#readme | |
license: BSD3 | |
license-file: LICENSE | |
author: Author name here | |
maintainer: example@example.com | |
copyright: 2018 Author name here | |
category: Web | |
build-type: Simple | |
cabal-version: >=1.10 | |
extra-source-files: README.md | |
executable scrape | |
hs-source-dirs: src | |
main-is: Main.hs | |
default-language: Haskell2010 | |
build-depends: base >= 4.7 && < 5 | |
, text | |
, http-conduit | |
, html-conduit | |
, xml-conduit |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
module Main where | |
import Control.Monad (forM_) | |
import Data.Text (Text) | |
import qualified Data.Text as T | |
import qualified Data.Text.IO as T | |
import Network.HTTP.Simple (httpSink) | |
import Text.HTML.DOM (sinkDoc) | |
import Text.XML (Name) | |
import Text.XML.Cursor | |
-- | Test whether the attribute named @k@ on the node under cursor @c@ lists
-- @v@ as one of its whitespace-separated tokens (e.g. one class among
-- several in a @class@ attribute).
--
-- Yields 'False' when 'attribute' produces no values for @k@.
attributeContains :: Name -> Text -> Cursor -> Bool
attributeContains k v c = v `elem` concatMap T.words (attribute k c)
-- | Apply @pick@ to every @\<a\>@ descendant of @root@ whose @class@
-- attribute contains the token @recent-entries-title-link@, and to each
-- descendant of those anchors, concatenating all results in traversal order.
extract :: Cursor -> (Cursor -> [Text]) -> [Text]
extract root pick = root $// (titleLinks &.// pick)
  where
    -- Axis selecting only the anchors that carry the title-link class.
    titleLinks =
      element "a"
        >=> check (attributeContains "class" "recent-entries-title-link")
-- | Download the blog front page and print, for every recent-entry title
-- link, its title on one line and its URL on the next, followed by a blank
-- line.
main :: IO ()
main = do
  doc <- httpSink "https://paiza.hatenablog.com/" $ const sinkDoc
  let anchors =
        fromDocument doc
          $// element "a"
          >=> check (attributeContains "class" "recent-entries-title-link")
      -- Title and href are taken from the same anchor. Collecting them in two
      -- separate passes and zipping the lists pairs the wrong title with the
      -- wrong URL as soon as one anchor has several text nodes or no href.
      -- Anchors without an href are skipped.
      entries =
        [ (T.concat (anchor $// content), href)
        | anchor <- anchors
        , href <- take 1 (attribute "href" anchor)
        ]
  forM_ entries $ \(title, href) -> T.putStrLn $ T.unlines [title, href]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment