Skip to content

Instantly share code, notes, and snippets.

@jbpotonnier
Created September 16, 2013 21:08
Show Gist options
  • Save jbpotonnier/6586643 to your computer and use it in GitHub Desktop.
Save jbpotonnier/6586643 to your computer and use it in GitHub Desktop.
The code I wrote as an exercise to answer Zalora job offer.
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Data.List (intercalate)
import Data.Maybe (fromMaybe)
import Network.HTTP.Conduit (simpleHttp)
import Control.Concurrent.Async (mapConcurrently)
import qualified Codec.Compression.GZip as GZip
import qualified Data.ByteString.Lazy as BS
import qualified Data.Text.Encoding as E
import qualified Data.Text as T
import qualified Text.XML.Cursor as Cursor
import qualified Text.HTML.DOM as Dom
import qualified Text.XML.Scraping as Scraping
import qualified Text.XML as XML
import Text.XML.Selector.TH (queryT, jq)
type Url = String
type Sku = T.Text
data Page = Page Url XML.Document
data Product = Product Url (Maybe Sku) deriving Show
shopUrl :: Url
shopUrl = "http://www.zalora.sg"
gender :: [String]
gender = ["men", "women"]
categories :: [String]
categories = ["shoes", "clothing", "accessories", "bags", "sports", "beauty"]
allUrls :: [Url]
allUrls = [intercalate "/" [shopUrl, g, c] ++ "/" | g <- gender, c <- categories]
getPage :: Url -> IO Page
getPage pageUrl = do
content <- simpleHttp pageUrl
return $ Page pageUrl (Dom.parseLBS content)
productsFromPage :: Page -> [Product]
productsFromPage (Page url content) =
let skus = (map Scraping.eid . queryT [jq| #productsCatalog .itm |]) cursor
cursor = Cursor.fromDocument content in
zipWith Product (repeat url) skus
csvFromProducts :: T.Text -> [Product] -> T.Text
csvFromProducts separator= T.unlines . map csvFromProduct
where
csvFromProduct :: Product -> T.Text
csvFromProduct (Product url mSku) = T.concat [T.pack url, separator, fromMaybe "No SKU" mSku]
main :: IO ()
main = do
pages <- mapConcurrently getPage allUrls
BS.writeFile "result.gz" (gzipCsvFileFor pages)
where
gzipCsvFileFor = GZip.compress .
BS.fromStrict .
E.encodeUtf8 .
csvFromProducts ";" .
concatMap (take 20 . productsFromPage)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment