Created
April 1, 2016 19:17
-
-
Save clrnd/3f5221fe2b053265b0b935da8bc374f8 to your computer and use it in GitHub Desktop.
Scalpel scraper for Dr. Who IMDB ratings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- stack --resolver lts-5.8 runghc --package "aeson scalpel bytestring text" | |
{-# LANGUAGE LambdaCase #-} | |
module Main where | |
import Control.Monad (forM) | |
import Text.HTML.Scalpel | |
import Data.Aeson | |
import Data.Text (Text, unpack, pack, strip) | |
import qualified Data.Text.IO as T | |
import qualified Data.ByteString.Lazy as B | |
data PreEpisode = PreEpisode | |
{ preTitle :: Text | |
, preNumber :: Int | |
, preUrl :: URL | |
, preDate :: Text | |
} deriving Show | |
data Episode = Episode | |
{ epTitle :: Text | |
, epSeason :: Int | |
, epNumber :: Int | |
, epScore :: Float | |
, epDate :: Text | |
} deriving Show | |
instance ToJSON Episode where | |
toJSON (Episode t ss n s d) = object [ pack "title" .= t | |
, pack "season" .= ss | |
, pack "number" .= n | |
, pack "score" .= s | |
, pack "date" .= show d ] | |
main :: IO () | |
main = do | |
seasons <- forM [1..9] getSeason | |
B.writeFile "output.json" (encode seasons) | |
getSeason :: Int -> IO [Episode] | |
getSeason season = do | |
contents <- T.readFile ("episodes?season=" ++ show season) | |
case scrapeStringLike contents pre_eps of | |
Nothing -> error "lolz" | |
Just pre_eps' -> forM pre_eps' $ \pre_ep' -> do | |
putStrLn $ "scrapping season " ++ show season ++ | |
", episode " ++ (show . preNumber) pre_ep' | |
scrapeURL ("http://www.imdb.com/" ++ preUrl pre_ep') score >>= \case | |
Nothing -> error $ "nou rating: " ++ show pre_ep' | |
Just score' -> return $ merge pre_ep' score' season | |
merge :: PreEpisode -> Float -> Int -> Episode | |
merge (PreEpisode t n _ d) s ss = Episode t ss n s d | |
score :: Scraper Text Float | |
score = fmap (read . unpack) . text $ "span" @: ["itemprop" @= "ratingValue"] | |
pre_eps :: Scraper Text [PreEpisode] | |
pre_eps = chroots ("div" @: ["itemprop" @= "episodes"]) pre_ep | |
pre_ep :: Scraper Text PreEpisode | |
pre_ep = PreEpisode <$> title <*> number <*> url <*> date | |
where | |
title :: Scraper Text Text | |
title = attr "title" $ "a" @: ["itemprop" @= "name"] | |
url :: Scraper Text String | |
url = fmap unpack . attr "href" $ "a" @: ["itemprop" @= "name"] | |
number :: Scraper Text Int | |
number = fmap (read . unpack) . attr "content" $ "meta" @: ["itemprop" @= "episodeNumber"] | |
date :: Scraper Text Text | |
date = fmap strip . text $ "div" @: ["class" @= "airdate"] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The initial season files where dowloaded using:
Blog post: http://clrnd.com.ar/posts/2016-03-15-dr-who-imdb-ratings.html