Created
September 29, 2014 23:42
-
-
Save hiteshjasani/9405b7812acee626cb18 to your computer and use it in GitHub Desktop.
Parsing html with Haskell
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--------------------------------------------------------------- | |
-- | | |
-- Module : Editors | |
-- Copyright : (c) Hitesh Jasani, 2007 | |
-- License : BSD3 | |
-- | |
-- Maintainer : Hitesh Jasani <[email protected]> | |
-- Stability : provisional | |
-- Portability : | |
-- | |
-- Created : 2007-11-06 | |
-- Version : 0.1 | |
-- | |
-- Parse a wikipedia page and pull out all the editors. | |
-- | |
-- This code depends upon Neil Mitchell's TagSoup library | |
-- from darcs (circa Nov 1, 2007) | |
--------------------------------------------------------------- | |
module Main where | |
import Control.Arrow ( (&&&) ) | |
import Text.HTML.TagSoup | |
import qualified Text.HTML.TagSoup.Match as M | |
import Text.HTML.Download ( openURL ) | |
import System.Environment ( getArgs ) | |
main = do | |
args <- getArgs | |
case args of | |
["save"] -> cachePage wikiHaskell cachefile | |
otherwise -> do src <- readCache cachefile | |
mapM_ showEdits . map (getName &&& getComment) $ tags src | |
where | |
tags = findSpans . parseTags | |
findSpans = sections (~== TagOpen "span" [("class","history-user")]) | |
showEdits (n,c) = putStrLn $ n ++ ": " ++ c | |
getName = innerText . take 1 . drop 2 | |
getComment ts = | |
let cSt = runtil ts | |
in if (isTagCloseName "li" $ head cSt) | |
then "(no comment)" | |
else reverse . drop 2 . reverse . concat . map to_s . | |
takeWhile (~/= TagOpen "span" [("class", "mw-history-undo")]) $ cSt | |
where | |
runtil ts = dropWhile (\x -> (x ~/= (TagOpen "span" [("class","comment")])) | |
&& not (M.tagClose (=="li") x)) ts | |
to_s (TagOpen "span" _) = "" | |
to_s (TagText s) = s | |
to_s (TagOpen "a" _) = "" | |
to_s (TagClose s) = "" | |
readCache fn = readFile fn | |
cachefile = "page.cache" | |
cachePage url fn = do | |
src <- openURL url | |
writeFile fn src | |
wikiHaskell = "http://en.wikipedia.org/w/index.php?title=Haskell_%28programming_language%29&action=history" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment