Skip to content

Instantly share code, notes, and snippets.

@cocreature
Created December 28, 2015 09:11
Show Gist options
  • Save cocreature/e756e4775fa682daf652 to your computer and use it in GitHub Desktop.
Save cocreature/e756e4775fa682daf652 to your computer and use it in GitHub Desktop.
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Main where
import qualified Pipes as P
import qualified Pipes.Prelude as P
import qualified Data.ByteString.Lazy as BSL
import Data.Monoid
import qualified Data.Text as T
import Network.HTTP.Conduit
import Network.URI
import Text.HTML.DOM
import Text.Regex.PCRE.Heavy
import Text.XML hiding (parseLBS)
import Text.XML.Cursor
-- | Entry point: run the whole scraping pipeline and print one rendered
-- 'BookInfo' entry per discovered book to standard output.
main :: IO ()
main = P.runEffect pipeline
  where
    -- Index pages -> book page URLs -> book metadata -> rendered text.
    pipeline =
      indexProducer
        P.>-> bookPipe
        P.>-> bookInfoPipe
        P.>-> P.map (T.unpack . bookInfoAria)
        P.>-> P.stdoutLn
-- | Render a 'BookInfo' as two lines of text: the download URL, followed by
-- an indented @out=<title>.pdf@ line.
--
-- NOTE(review): this looks like an aria2 input-file entry — confirm.
bookInfoAria :: BookInfo -> T.Text
bookInfoAria info = T.concat [url info, "\n out=", title info, ".pdf"]
-- | What the scraper records about a single book.
data BookInfo = BookInfo
  { title :: !T.Text
    -- ^ Text content found under the element with id @title@ on the book page.
  , url :: !T.Text
    -- ^ Absolute link to the book's PDF download.
  }
  deriving (Show, Eq, Ord)
-- | URL of a single book page. It is not referenced by any other definition
-- visible in this file; presumably kept for manual testing — confirm.
testBook :: String
testBook = "http://link.springer.com/book/10.1007/978-1-4684-9440-2"
-- | First search-result page; crawling of the index starts here.
initialSite :: String
initialSite = "http://link.springer.com/search/page/1?facet-series=%22136%22&facet-content-type=%22Book%22"
-- | Produce the URL of every search-result page: first 'initialSite' itself,
-- then each successive page that 'bookIndex' discovers, until it reports that
-- there is no further page.
indexProducer :: P.Producer String IO ()
indexProducer = do
  P.yield initialSite
  P.unfoldr bookIndex initialSite
-- | Turn each search-result page URL into the absolute URLs of the books
-- listed on that page (one output per book).
bookPipe :: P.Pipe String String IO ()
bookPipe = P.mapM bookList P.>-> P.concat P.>-> P.map absolute
  where
    -- 'bookList' yields hrefs that are prefixed with the site root here.
    absolute :: T.Text -> String
    absolute path = T.unpack ("http://link.springer.com" <> path)
-- | Fetch the metadata for each incoming book URL, silently dropping books
-- for which 'bookDownload' found no PDF link.
bookInfoPipe :: P.Pipe String BookInfo IO ()
bookInfoPipe = fetchInfo P.>-> keepFound
  where
    fetchInfo :: P.Pipe String (Maybe BookInfo) IO ()
    fetchInfo = P.mapM bookDownload

    -- Flattening a 'Maybe' forwards the 'Just' payloads and skips 'Nothing'.
    keepFound :: P.Pipe (Maybe BookInfo) BookInfo IO ()
    keepFound = P.concat
-- | Download a book page and extract its title and PDF link.
--
-- Returns 'Nothing' when the page has no @<a id="toc-download-book-pdf-link">@
-- element carrying an @href@; otherwise the first such href is prefixed with
-- the site root and paired with the concatenated text found under the element
-- whose id is @title@.
bookDownload :: String -> IO (Maybe BookInfo)
bookDownload bookUrl = do
  body <- simpleHttp bookUrl
  let root = fromDocument (parseLBS body)
      pdfAnchor = element "a" >=> attributeIs "id" "toc-download-book-pdf-link"
      hrefs = concat (root $// pdfAnchor &| attribute "href")
      bookTitle = T.concat (root $// attributeIs "id" "title" &/ content)
  pure $ case hrefs of
    [] -> Nothing
    (href : _) -> Just (BookInfo bookTitle ("http://link.springer.com" <> href))
-- | Unfold step for 'P.unfoldr': download the given search-result page and
-- look for its successor via 'nextPage'.
--
-- @Right (next, next)@ emits the next page's URL and also uses it as the new
-- seed; @Left ()@ ends the unfold when no successor link is found.
bookIndex :: String -> IO (Either () (String, String))
bookIndex pageUrl = step . nextPage <$> simpleHttp pageUrl
  where
    step = maybe (Left ()) (\next -> Right (next, next))
-- | Download a search-result page and return the @href@ of every
-- @<a class="title">@ element on it, in document order. The hrefs are
-- returned as found; making them absolute is left to the caller.
bookList :: String -> IO [T.Text]
bookList pageUrl = do
  body <- simpleHttp pageUrl
  let root = fromDocument (parseLBS body)
      titleAnchor = element "a" >=> attributeIs "class" "title"
  pure (root $// titleAnchor &| T.concat . attribute "href")
-- | Extract the absolute URL of the following search-result page from the
-- raw HTML of the current one.
--
-- Looks for a @<link rel="next">@ element and takes its first @href@; returns
-- 'Nothing' when there is none. The href is prefixed with the site root.
--
-- Every literal @amp;@ in the href is removed (leftovers of escaped
-- @&amp;@ separators). The previous implementation used 'sub', which only
-- replaces the first match, so an href with several query parameters kept
-- stray @amp;@ fragments; 'gsub' strips all of them.
nextPage :: BSL.ByteString -> Maybe String
nextPage bs =
  case concat hrefs of
    [] -> Nothing
    (href : _) -> Just ("http://link.springer.com" <> stripAmp (T.unpack href))
  where
    cursor = fromDocument (parseLBS bs)
    hrefs =
      cursor $// (element "link" >=> attributeIs "rel" "next") &| attribute "href"

    -- Remove all occurrences, not just the first one.
    stripAmp :: String -> String
    stripAmp = gsub [re|amp;|] ("" :: String)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment