Created
September 16, 2013 21:08
-
-
Save jbpotonnier/6586643 to your computer and use it in GitHub Desktop.
The code I wrote as an exercise to answer Zalora job offer.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE QuasiQuotes #-} | |
{-# LANGUAGE OverloadedStrings #-} | |
module Main where | |
import Data.List (intercalate) | |
import Data.Maybe (fromMaybe) | |
import Network.HTTP.Conduit (simpleHttp) | |
import Control.Concurrent.Async (mapConcurrently) | |
import qualified Codec.Compression.GZip as GZip | |
import qualified Data.ByteString.Lazy as BS | |
import qualified Data.Text.Encoding as E | |
import qualified Data.Text as T | |
import qualified Text.XML.Cursor as Cursor | |
import qualified Text.HTML.DOM as Dom | |
import qualified Text.XML.Scraping as Scraping | |
import qualified Text.XML as XML | |
import Text.XML.Selector.TH (queryT, jq) | |
type Url = String | |
type Sku = T.Text | |
data Page = Page Url XML.Document | |
data Product = Product Url (Maybe Sku) deriving Show | |
shopUrl :: Url | |
shopUrl = "http://www.zalora.sg" | |
gender :: [String] | |
gender = ["men", "women"] | |
categories :: [String] | |
categories = ["shoes", "clothing", "accessories", "bags", "sports", "beauty"] | |
allUrls :: [Url] | |
allUrls = [intercalate "/" [shopUrl, g, c] ++ "/" | g <- gender, c <- categories] | |
getPage :: Url -> IO Page | |
getPage pageUrl = do | |
content <- simpleHttp pageUrl | |
return $ Page pageUrl (Dom.parseLBS content) | |
productsFromPage :: Page -> [Product] | |
productsFromPage (Page url content) = | |
let skus = (map Scraping.eid . queryT [jq| #productsCatalog .itm |]) cursor | |
cursor = Cursor.fromDocument content in | |
zipWith Product (repeat url) skus | |
csvFromProducts :: T.Text -> [Product] -> T.Text | |
csvFromProducts separator= T.unlines . map csvFromProduct | |
where | |
csvFromProduct :: Product -> T.Text | |
csvFromProduct (Product url mSku) = T.concat [T.pack url, separator, fromMaybe "No SKU" mSku] | |
main :: IO () | |
main = do | |
pages <- mapConcurrently getPage allUrls | |
BS.writeFile "result.gz" (gzipCsvFileFor pages) | |
where | |
gzipCsvFileFor = GZip.compress . | |
BS.fromStrict . | |
E.encodeUtf8 . | |
csvFromProducts ";" . | |
concatMap (take 20 . productsFromPage) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment