Skip to content

Instantly share code, notes, and snippets.

@n4to4
Created August 8, 2018 10:07
Show Gist options
  • Save n4to4/77ad724cae66abf1cb69003d7cf2a5be to your computer and use it in GitHub Desktop.
Save n4to4/77ad724cae66abf1cb69003d7cf2a5be to your computer and use it in GitHub Desktop.
scrape.hs
-- Package description for the "scrape" executable.
name:                scrape
version:             0.1.0.0
-- synopsis:
-- description:
homepage:            https://github.com/githubuser/scrape#readme
license:             BSD3
license-file:        LICENSE
author:              Author name here
-- NOTE(review): the maintainer value below looks like an obfuscated
-- e-mail placeholder; confirm the intended address.
maintainer:          [email protected]
copyright:           2018 Author name here
category:            Web
build-type:          Simple
cabal-version:       >=1.10
extra-source-files:  README.md

-- The only component: an executable whose entry point is src/Main.hs.
executable scrape
  hs-source-dirs:      src
  main-is:             Main.hs
  default-language:    Haskell2010
  build-depends:       base >= 4.7 && < 5
                     , text
                     , http-conduit
                     , html-conduit
                     , xml-conduit
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Control.Monad (forM_)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Network.HTTP.Simple (httpSink)
import Text.HTML.DOM (sinkDoc)
import Text.XML (Name)
import Text.XML.Cursor
-- | Test whether attribute @k@ of the node under cursor @c@, read as a
-- whitespace-separated list of tokens, contains the token @v@.
--
-- The result is 'False' when 'attribute' yields no values for @k@.
attributeContains :: Name -> Text -> Cursor -> Bool
attributeContains k v c = any hasToken (attribute k c)
  where
    -- A single attribute value matches when one of its words equals @v@.
    hasToken = elem v . T.words
-- | Apply the extractor @f@ to every title link located below cursor @c@
-- and collect the results into one flat list.
--
-- A title link is an @a@ element among the descendants of @c@ whose
-- @class@ attribute contains the token @recent-entries-title-link@.
-- Because the axes are joined with '&.//', @f@ is run on each matching
-- link itself as well as on all of its descendants.
extract :: Cursor -> (Cursor -> [Text]) -> [Text]
extract c f = c $// (titleLinks &.// f)
  where
    -- Axis that keeps only the anchors carrying the title-link class.
    titleLinks =
      element "a"
        >=> check (attributeContains "class" "recent-entries-title-link")
-- | Download the blog front page and print every recent-entry link as its
-- title followed by its URL, with a blank line after each entry.
--
-- Each (title, URL) pair is built from a single anchor element. Collecting
-- all titles and all URLs as two separate flat lists and zipping them
-- afterwards would mis-pair entries as soon as one anchor had zero or
-- several text nodes, or lacked an @href@ attribute.
main :: IO ()
main = do
  doc <- httpSink "https://paiza.hatenablog.com/" $ const sinkDoc
  let cursor = fromDocument doc
      -- All @a@ elements below the root that carry the title-link class.
      anchors =
        cursor $// element "a"
          >=> check (attributeContains "class" "recent-entries-title-link")
      -- The title is the concatenation of all text found in the anchor
      -- (itself or its descendants); the URL is the anchor's own @href@.
      -- Anchors without an @href@ produce no entry.
      entries =
        [ (T.concat (anchor $.// content), href)
        | anchor <- anchors
        , href <- attribute "href" anchor
        ]
  -- 'T.unlines' ends with a newline and 'T.putStrLn' adds another, which
  -- leaves one blank line between consecutive entries.
  forM_ entries $ \(title, href) -> T.putStrLn $ T.unlines [title, href]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment