Last active
January 30, 2016 07:26
-
-
Save lotz84/c870d6a3f9a97e0fc8b5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
import Control.Concurrent (threadDelay) | |
import Control.Lens | |
import Control.Monad.IO.Class (liftIO) | |
import Control.Monad.State | |
import Data.List (nub, isPrefixOf) | |
import Data.Text.Lazy.Encoding (decodeUtf8) | |
import Data.Text.Lens (unpacked) | |
import qualified Network.Wreq as Wreq | |
import Text.Taggy.Lens | |
baseUrl :: String | |
baseUrl = "http://www.example.com/" | |
scraping :: Int -> String -> StateT [String] IO () | |
scraping 0 _ = pure () -- 終了条件 | |
scraping n url = do | |
seen <- gets (elem url) -- 既に訪れたどうか | |
unless seen $ do -- 既に訪れていれば何もしない | |
liftIO . putStrLn $ show n ++ ": " ++ url -- 何回目でどこに来たかを表示する | |
res <- liftIO $ Wreq.get url -- コンテンツをダウンロードする | |
seen <- modify (url:) >> get -- 既に見たURLのリストを更新してseenに束縛する | |
let next = nub $ res ^.. Wreq.responseBody . to decodeUtf8 . html -- HTML | |
. allNamed (only "a") . attrs . at "href" . _Just -- 全てのaタグからhref属性の値を存在すれば取り出す | |
. unpacked . filtered (isPrefixOf baseUrl) -- Stringに変換してルートがbaseUrlに一致するものだけ取り出す | |
. filtered (not . (`elem` seen)) -- 既に訪れたところは取り出さない | |
liftIO $ threadDelay 1000000 -- DoS攻撃にならないように1秒間待つ | |
mapM_ (scraping (n-1)) next -- 再帰的にスクレイピングしていく | |
main :: IO () | |
main = evalStateT (scraping 3 baseUrl) [] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment