Created
December 28, 2015 09:11
-
-
Save cocreature/e756e4775fa682daf652 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
{-# LANGUAGE QuasiQuotes #-} | |
module Main where | |
import qualified Pipes as P | |
import qualified Pipes.Prelude as P | |
import qualified Data.ByteString.Lazy as BSL | |
import Data.Monoid | |
import qualified Data.Text as T | |
import Network.HTTP.Conduit | |
import Network.URI | |
import Text.HTML.DOM | |
import Text.Regex.PCRE.Heavy | |
import Text.XML hiding (parseLBS) | |
import Text.XML.Cursor | |
-- | Run the whole scraping pipeline: walk the index pages, collect the book
-- pages linked from each, look up every book's download link, and print one
-- 'bookInfoAria' entry per book to standard output.
main :: IO ()
main =
  P.runEffect $
    indexProducer
      P.>-> bookPipe
      P.>-> bookInfoPipe
      P.>-> P.map bookInfoAria
      P.>-> P.map T.unpack
      P.>-> P.stdoutLn
-- | Render a 'BookInfo' as two lines of text: the download URL, followed by
-- an indented @out=<title>.pdf@ line.
--
-- NOTE(review): this looks like the aria2 input-file format (URI line plus
-- indented option line) -- confirm against the intended consumer.
bookInfoAria :: BookInfo -> T.Text
bookInfoAria info =
  T.concat [url info, "\n out=", title info, ".pdf"]
-- | What the scraper records about a single book. Both fields are strict.
data BookInfo = BookInfo
  { title :: !T.Text -- ^ Title text scraped from the book page.
  , url :: !T.Text -- ^ Absolute URL of the book's PDF download.
  }
  deriving (Show, Eq, Ord)
-- | URL of one concrete book page, handy for trying 'bookDownload' by hand.
-- It is not referenced by the other definitions visible in this file.
testBook :: String
testBook =
  "http://link.springer.com/book/10.1007/978-1-4684-9440-2"
-- | First page of the search-result index; 'indexProducer' starts crawling
-- from here.
initialSite :: String
initialSite = "http://link.springer.com/search/page/1?facet-series=%22136%22&facet-content-type=%22Book%22"
-- | Produce the URL of every index page: first 'initialSite' itself, then
-- each page reached by repeatedly following the "next" link via 'bookIndex'
-- until no further page is found.
indexProducer :: P.Producer String IO ()
indexProducer = do
  P.yield initialSite
  P.unfoldr bookIndex initialSite
-- | Turn each index-page URL into the absolute URLs of the book pages it
-- links to, emitted one at a time.
bookPipe :: P.Pipe String String IO ()
bookPipe =
  P.mapM bookList
    P.>-> P.concat
    P.>-> P.map absoluteUrl
  where
    -- The scraped hrefs get the site prefix prepended before being emitted.
    absoluteUrl :: T.Text -> String
    absoluteUrl = T.unpack . ("http://link.springer.com" <>)
-- | Look up every incoming book-page URL with 'bookDownload' and forward
-- only the books for which a download link was found; 'Nothing' results
-- are dropped.
bookInfoPipe :: P.Pipe String BookInfo IO ()
bookInfoPipe = P.for (P.mapM bookDownload) P.each
-- | Fetch a book page and extract its title and PDF download link.
--
-- Returns 'Nothing' when the page has no @<a id="toc-download-book-pdf-link">@
-- element carrying an @href@; otherwise the first such @href@, prefixed with
-- the site root, is paired with the text found under the element whose @id@
-- is @title@.
bookDownload :: String -> IO (Maybe BookInfo)
bookDownload bookUrl = do
  body <- simpleHttp bookUrl
  let root = fromDocument (parseLBS body)
      pdfAnchor = element "a" >=> attributeIs "id" "toc-download-book-pdf-link"
      hrefs = concatMap (attribute "href") (root $// pdfAnchor)
      bookTitle = T.concat (root $// attributeIs "id" "title" &/ content)
  case hrefs of
    [] -> pure Nothing
    (href:_) ->
      pure (Just (BookInfo bookTitle ("http://link.springer.com" <> href)))
-- | Unfold step for 'indexProducer': download the given index page and look
-- for its "next page" link.
--
-- Yields @Right (next, next)@ (emit the next URL and continue from it) when
-- 'nextPage' finds a link, and @Left ()@ to stop the unfold otherwise.
bookIndex :: String -> IO (Either () (String, String))
bookIndex pageUrl = step . nextPage <$> simpleHttp pageUrl
  where
    step = maybe (Left ()) (\next -> Right (next, next))
-- | Download an index page and return the @href@ of every @<a>@ element
-- whose @class@ attribute is exactly @title@. The hrefs are returned as
-- found on the page, without the site prefix.
bookList :: String -> IO [T.Text]
bookList pageUrl = hrefs . fromDocument . parseLBS <$> simpleHttp pageUrl
  where
    titleAnchors = element "a" >=> attributeIs "class" "title"
    hrefs root = map (T.concat . attribute "href") (root $// titleAnchors)
-- | Extract the absolute URL of the next index page from a downloaded page.
--
-- Looks for a @<link rel="next" href="...">@ element and returns its first
-- @href@ prefixed with the site root, or 'Nothing' if there is none.
--
-- The @href@ may still contain HTML-escaped ampersands (@&amp;@), which are
-- turned back into plain @&@. Every occurrence is replaced, not just the
-- first, so URLs with several query parameters stay intact; text that merely
-- contains @amp;@ without the leading @&@ is left alone.
nextPage :: BSL.ByteString -> Maybe String
nextPage bs =
  case concat nextHrefs of
    [] -> Nothing
    (href:_) ->
      Just ("http://link.springer.com" <> unescapeAmp (T.unpack href))
  where
    cursor = fromDocument (parseLBS bs)
    nextHrefs =
      cursor $// (element "link" >=> attributeIs "rel" "next") &|
      attribute "href"
    -- gsub (rather than sub) so that all escaped ampersands are handled.
    unescapeAmp :: String -> String
    unescapeAmp = gsub [re|&amp;|] ("&" :: String)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment