Last active
November 7, 2016 15:56
-
-
Save DuckOfDoom/b8839a71514cf1858dc00840a0d69506 to your computer and use it in GitHub Desktop.
Html parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# OPTIONS_GHC -Wall #-} | |
{-# TemplateHaskell #-} | |
module Main where | |
import Control.Lens (makeLenses, (.~), (^.)) | |
import Data.List.Split (chunksOf) | |
import Data.ByteString.Lazy (ByteString) | |
import qualified Data.ByteString.Lazy as LBS (writeFile, toStrict) | |
import Data.Text (unpack) | |
import Data.Text.Encoding (decodeUtf8) | |
import qualified Network.Wreq as Wreq (get, responseBody) | |
import Text.HTML.TagSoup (Tag, fromAttrib, isTagCloseName, | |
parseTags, renderTags, (~/=)) | |
data Event = Event | |
{ name :: String | |
, location :: String | |
, date :: String | |
, link :: String | |
} | |
deriving Show | |
makeLenses ''Event | |
url :: String | |
url = "http://us.battle.net/hearthstone/en/fireside-gatherings?country=RU#fireside-gatherings" | |
getHtml :: IO ByteString | |
getHtml = do | |
response <- Wreq.get url | |
return $ (response ^. Wreq.responseBody) | |
findEventsTable :: ByteString -> [Tag ByteString] | |
findEventsTable s = takeWhile (~/= "</div>") . drop 1 . dropWhile (~/= "</div>") . dropWhile (~/= "<div class=meetups-event-table>") . parseTags | |
splitEvents :: [Tag ByteString] -> [[Tag ByteString]] | |
splitEvents = chunksOf 32 | |
parseEvent :: [Tag ByteString] -> Event | |
parseEvent tags = Event name location date link | |
where link = "http://us.battle.net/" ++ (fromAttrib "href" (head tags)) | |
name = "" | |
location = "" | |
date = "" | |
test :: IO () | |
test = do | |
tags <- (splitEvents . findEventsTable) <$> getHtml | |
LBS.writeFile "test.html" $ renderTags $ head $ tags | |
downloadHtml :: IO () | |
downloadHtml = do | |
getHtml >>= (LBS.writeFile "downloaded.html") | |
main :: IO () | |
main = return () |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment