Skip to content

Instantly share code, notes, and snippets.

@tanakh
Created April 6, 2012 08:42
Show Gist options
  • Select an option

  • Save tanakh/2318233 to your computer and use it in GitHub Desktop.

Select an option

Save tanakh/2318233 to your computer and use it in GitHub Desktop.
tagsoupとregex-tdfaでHTMLからリンク抽出 ref: http://qiita.com/items/079122e196e36e633a50
{-# LANGUAGE OverloadedStrings, QuasiQuotes #-}
import Control.Applicative
import Control.Monad
import qualified Data.ByteString.Lazy.Char8 as B
import Network.HTTP.Conduit
import System.Cmd
import System.Environment
import System.Process.QQ
import Text.HTML.TagSoup
import Text.HTML.TagSoup.Tree
import Text.Printf
import Text.Regex.TDFA
baseUrl = "http://dumps.wikimedia.org/"
extractLinks url regex = do
body <- B.unpack <$> simpleHttp url
let tree = tagTree $ parseTags body
return
[ (name, link)
| (TagBranch "a" attrs [TagLeaf (TagText name)]) <- universeTree tree
, name =~ (regex :: String)
, let Just link = lookup "href" attrs
]
main :: IO ()
main = do
args <- getArgs
case args of
[dest] -> do
[cmd|rm -rf #{dest}|]
[cmd|mkdir -p #{dest}|]
langs <- extractLinks (baseUrl ++ "backup-index.html") ".+wiki$"
forM_ (zip [1..] langs) $ \(ix, (name, url)) -> do
printf "[%d/%d]: %s\n" (ix :: Int) (length langs) name
links <- extractLinks (baseUrl ++ url) ".+pages-articles\\.xml\\..*"
forM_ links $ \(name, url) -> do
let aurl = baseUrl ++ url
putStrLn $ "> " ++ name ++ ": " ++ url
system $ "aria2c -d " ++ dest ++ " " ++ baseUrl ++ url
_ -> putStrLn "usage: runhaskell main.hs <dest-dir>"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment