Skip to content

Instantly share code, notes, and snippets.

@hiteshjasani
Created September 29, 2014 23:42
Show Gist options
  • Save hiteshjasani/9405b7812acee626cb18 to your computer and use it in GitHub Desktop.
Save hiteshjasani/9405b7812acee626cb18 to your computer and use it in GitHub Desktop.
Parsing html with Haskell
---------------------------------------------------------------
-- |
-- Module : Editors
-- Copyright : (c) Hitesh Jasani, 2007
-- License : BSD3
--
-- Maintainer : Hitesh Jasani <[email protected]>
-- Stability : provisional
-- Portability :
--
-- Created : 2007-11-06
-- Version : 0.1
--
-- Parse a wikipedia page and pull out all the editors.
--
-- This code depends upon Neil Mitchell's TagSoup library
-- from darcs (circa Nov 1, 2007)
---------------------------------------------------------------
module Main where
import Control.Arrow ( (&&&) )
import Text.HTML.TagSoup
import qualified Text.HTML.TagSoup.Match as M
import Text.HTML.Download ( openURL )
import System.Environment ( getArgs )
main = do
args <- getArgs
case args of
["save"] -> cachePage wikiHaskell cachefile
otherwise -> do src <- readCache cachefile
mapM_ showEdits . map (getName &&& getComment) $ tags src
where
tags = findSpans . parseTags
findSpans = sections (~== TagOpen "span" [("class","history-user")])
showEdits (n,c) = putStrLn $ n ++ ": " ++ c
getName = innerText . take 1 . drop 2
getComment ts =
let cSt = runtil ts
in if (isTagCloseName "li" $ head cSt)
then "(no comment)"
else reverse . drop 2 . reverse . concat . map to_s .
takeWhile (~/= TagOpen "span" [("class", "mw-history-undo")]) $ cSt
where
runtil ts = dropWhile (\x -> (x ~/= (TagOpen "span" [("class","comment")]))
&& not (M.tagClose (=="li") x)) ts
to_s (TagOpen "span" _) = ""
to_s (TagText s) = s
to_s (TagOpen "a" _) = ""
to_s (TagClose s) = ""
readCache fn = readFile fn
cachefile = "page.cache"
cachePage url fn = do
src <- openURL url
writeFile fn src
wikiHaskell = "http://en.wikipedia.org/w/index.php?title=Haskell_%28programming_language%29&action=history"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment