Created
April 6, 2012 08:42
-
-
Save tanakh/2318233 to your computer and use it in GitHub Desktop.
tagsoupとregex-tdfaでHTMLからリンク抽出 ref: http://qiita.com/items/079122e196e36e633a50
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| {-# LANGUAGE OverloadedStrings, QuasiQuotes #-} | |
| import Control.Applicative | |
| import Control.Monad | |
| import qualified Data.ByteString.Lazy.Char8 as B | |
| import Network.HTTP.Conduit | |
| import System.Cmd | |
| import System.Environment | |
| import System.Process.QQ | |
| import Text.HTML.TagSoup | |
| import Text.HTML.TagSoup.Tree | |
| import Text.Printf | |
| import Text.Regex.TDFA | |
| baseUrl = "http://dumps.wikimedia.org/" | |
| extractLinks url regex = do | |
| body <- B.unpack <$> simpleHttp url | |
| let tree = tagTree $ parseTags body | |
| return | |
| [ (name, link) | |
| | (TagBranch "a" attrs [TagLeaf (TagText name)]) <- universeTree tree | |
| , name =~ (regex :: String) | |
| , let Just link = lookup "href" attrs | |
| ] | |
| main :: IO () | |
| main = do | |
| args <- getArgs | |
| case args of | |
| [dest] -> do | |
| [cmd|rm -rf #{dest}|] | |
| [cmd|mkdir -p #{dest}|] | |
| langs <- extractLinks (baseUrl ++ "backup-index.html") ".+wiki$" | |
| forM_ (zip [1..] langs) $ \(ix, (name, url)) -> do | |
| printf "[%d/%d]: %s\n" (ix :: Int) (length langs) name | |
| links <- extractLinks (baseUrl ++ url) ".+pages-articles\\.xml\\..*" | |
| forM_ links $ \(name, url) -> do | |
| let aurl = baseUrl ++ url | |
| putStrLn $ "> " ++ name ++ ": " ++ url | |
| system $ "aria2c -d " ++ dest ++ " " ++ baseUrl ++ url | |
| _ -> putStrLn "usage: runhaskell main.hs <dest-dir>" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment