Created
December 12, 2018 04:21
-
-
Save gelisam/ba755361fdb34c952776e79b8bf02602 to your computer and use it in GitHub Desktop.
Averaged across persons, excluding legal fees, how much money had each person spent by time 6?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- in response to https://www.reddit.com/r/haskell/comments/a50xpr/datahaskell_solve_this_small_problem_to_fill_some/ | |
{-# LANGUAGE BangPatterns, OverloadedStrings, RecordWildCards, ScopedTypeVariables #-} | |
module Main where | |
import Control.Category ((>>>)) | |
import Data.Function ((&)) | |
import Data.Map.Strict (Map, (!)) | |
import Data.Set (Set) | |
import Test.DocTest (doctest) | |
import qualified Data.List as List | |
import qualified Data.Map.Strict as Map | |
import qualified Data.Set as Set | |
import qualified Data.Text.Lazy as Lazy | |
import qualified Data.Text as Strict | |
-- This is the input given. I have hardcoded it into the file for completeness, | |
-- but you could also obtain it from a file using 'Data.Text.Lazy.IO.readFile'. | |
-- | |
-- This looks like csv, so my first instict is to use the cassava library. | |
-- Unfortunately, due to the extra whitespace, this isn't _quite_ valid csv, so | |
-- we would need to do some pre-processing in order to transform the input into | |
-- a format which cassava can parse, and also cassava outputs a vector, which is | |
-- completely loaded into memory, not streamed. For those two reasons, I will | |
-- parse that format myself; it's just a simple comma-delimited format, after | |
-- all. | |
input :: Lazy.Text | |
input = Lazy.unlines | |
[ "item , price" | |
, "computer , 1000" | |
, "car , 5000" | |
, "legal fees (1 hour) , 400" | |
, "" | |
, "date , person , item-bought , units-bought" | |
, "7 , bob , car , 1" | |
, "5 , alice , car , 1" | |
, "4 , bob , legal fees (1 hour) , 20" | |
, "3 , alice , computer , 2" | |
, "1 , bob , computer , 1" | |
] | |
-- My goal is to parse the above tables into the following more precise types. | |
-- | |
-- Throughout this file, I will be careful to distinguish between 'Lazy.Text' | |
-- and 'Strict.Text'. I want the input to be lazy, so I can stream it, but once | |
-- an individual item name such as "computer" has been loaded, it is not useful | |
-- to be streaming the individual letters of the word "computer", it is better | |
-- to load the entire string into memory. | |
data PricingEntry = PricingEntry | |
{ item :: Strict.Text | |
, price :: Int | |
} | |
deriving Show | |
data Purchase = Purchase | |
{ date :: Int | |
, person :: Strict.Text | |
, itemBought :: Strict.Text | |
, unitsBought :: Int | |
} | |
deriving Show | |
-- | | |
-- The input contains two tables, which we will need to parse separately, so the | |
-- first step is to write a function which can split such an input into two | |
-- parts. Doing so will load the entirety of the first table into memory, which | |
-- is fine; it's the second table we want to stream. | |
-- | |
-- >>> breakAtBlankLine "hello\nworld\n\nfoo\nbar\nbaz\n\nquux\n" | |
-- ("hello\nworld\n","foo\nbar\nbaz\n\nquux\n") | |
-- | |
-- Note that I am being sloppy with error handling in this file, as I don't | |
-- specify what should happen if the input doesn't have the format we expect. In | |
-- this case, I get a pattern-matching exception if there is no blank line. | |
-- | |
-- >>> breakAtBlankLine "hello" | |
-- ...Irrefutable pattern failed for pattern (before, "" : after) | |
-- ... | |
breakAtBlankLine :: Lazy.Text -> (Lazy.Text, Lazy.Text) | |
breakAtBlankLine text = (Lazy.unlines before, Lazy.unlines after) | |
where | |
-- ["hello","world","","foo","bar","baz","","quux"] | |
allLines :: [Lazy.Text] | |
allLines = Lazy.lines text | |
-- ["hello","world"] | |
-- ["foo","bar","baz","","quux"] | |
before, after :: [Lazy.Text] | |
(before, "":after) = List.break (== "") allLines | |
-- | | |
-- Each table cell is padded with extra whitespace, which we'll want to remove. | |
-- So the next step is to write a function which can trim the whitespace from | |
-- both sides. | |
-- | |
-- My implementation compose two smaller functions, which respectively strip the | |
-- whitespace from the front and the end of the string. While in Haskell it is | |
-- more typical to compose those functions using '(.)' from right to left, I | |
-- will use '(>>>)' to compose them from left to right so the transformation | |
-- flows from top to bottom. In between each step, I will include a comment | |
-- demonstrating what the example input looks like in between those steps. | |
-- | |
-- >>> trim " foo bar " | |
-- "foo bar" | |
trim :: Strict.Text -> Strict.Text | |
trim = Strict.dropWhile (== ' ') | |
-- "foo bar " | |
>>> Strict.dropWhileEnd (== ' ') | |
-- | | |
-- Next, let's parse an individual table. | |
-- | |
-- >>> parseTable "item , price\ncomputer , 1000\n" :: [PricingEntry] | |
-- [PricingEntry {item = "computer", price = 1000}] | |
parseTable :: forall a. FromRow a | |
=> Lazy.Text -> [a] | |
parseTable = Lazy.lines | |
-- ["item , price","computer , 1000"] | |
>>> fmap (fmap trim . Strict.split (== ',') . Lazy.toStrict) | |
-- [["item","price"],["computer","1000"]] | |
>>> go | |
where | |
go :: [[Strict.Text]] -> [a] | |
go (header:data_) = data_ | |
-- [["computer","1000"]] | |
& fmap (zip header) | |
-- [[("item","computer"),("price","1000")]] | |
& fmap Map.fromList | |
-- [Map.fromList [("item","computer"),("price","1000")]] | |
& fmap parseRow | |
go [] = error "no header" | |
-- The last step, 'parseRow', somehow converted a @Map Text Text@ into a | |
-- 'PricingEntry'. I of course need to specify how to do that for both | |
-- 'PricingEntry' and 'Purchase'. | |
-- | |
-- I have written the 'Purchase' instance using the 'Applicative' style, and the | |
-- 'PricingEntry' instance more verbosely, without that style, in order to | |
-- demonstrate what that style desugars into. | |
-- | | |
-- >>> parseRow (Map.fromList [("item","computer"),("price","1000")]) :: PricingEntry | |
-- PricingEntry {item = "computer", price = 1000} | |
class FromRow a where | |
parseRow :: Map Strict.Text Strict.Text -> a | |
instance FromRow PricingEntry where | |
parseRow map_ = PricingEntry (cell "item" map_) | |
(cell "price" map_) | |
instance FromRow Purchase where | |
parseRow = Purchase <$> cell "date" | |
<*> cell "person" | |
<*> cell "item-bought" | |
<*> cell "units-bought" | |
-- | | |
-- >>> cell "price" (Map.fromList [("item","computer"),("price","1000")]) :: Int | |
-- 1000 | |
cell :: FromCell a | |
=> Strict.Text -> Map Strict.Text Strict.Text -> a | |
cell k = (! k) | |
-- "1000" | |
>>> parseCell | |
-- Different calls to 'cell' return fields of different types, so I again need | |
-- to specify how to parse each type. | |
-- | | |
-- >>> parseCell "1000" :: Int | |
-- 1000 | |
class FromCell a where | |
parseCell :: Strict.Text -> a | |
instance FromCell Strict.Text where | |
parseCell = id | |
instance FromCell Int where | |
parseCell = read . Strict.unpack | |
-- | | |
-- All right, we have now parsed our two tables into two lists. A list of | |
-- 'Purchase's is good because we can stream lists, but for 'PricingEntry', a | |
-- 'Map' from each item to its price would be more convenient. | |
-- | |
-- >>> toPriceMap [PricingEntry "computer" 1000] | |
-- fromList [("computer",1000)] | |
toPriceMap :: [PricingEntry] -> Map Strict.Text Int | |
toPriceMap = fmap (\(PricingEntry {..}) -> (item, price)) | |
-- [("computer",1000)] | |
>>> Map.fromList | |
-- The question we want to answer is "Averaged across persons, excluding legal | |
-- fees, how much money had each person spent by time 6?", so let's define a | |
-- datatype which accumulates all the data we need in order to answer this. | |
-- | |
-- I am using bang patterns because I want to make sure that as I go through the | |
-- 'Purchase's and update the 'Summary', I really update all of the fields. | |
-- Otherwise, it's easy for those fields to accidentally hold a larger and | |
-- larger thunk instead of holding e.g. a simple Int, thereby negating our | |
-- efforts to keep a constant memory while we stream the data. | |
data Summary = Summary | |
{ people :: !(Set Strict.Text) | |
, moneySpent :: !Int | |
} | |
deriving Show | |
initialSummary :: Summary | |
initialSummary = Summary mempty 0 | |
averageSpending :: Summary -> Double | |
averageSpending (Summary {..}) = fromIntegral moneySpent | |
/ max 1 (fromIntegral (length people)) | |
-- | | |
-- All right, we can now write a computation which updates our 'Summary' after | |
-- looking at a single 'Purchase': | |
-- | |
-- >>> :{ | |
-- updateSummary (Map.fromList [("computer",1000)]) | |
-- initialSummary | |
-- (Purchase 3 "alice" "computer" 2) | |
-- :} | |
-- Summary {people = fromList ["alice"], moneySpent = 2000} | |
updateSummary :: Map Strict.Text Int -> Summary -> Purchase -> Summary | |
updateSummary priceMap (Summary {..}) (Purchase {..}) | |
| date < 6 && itemBought /= "legal fees (1 hour)" | |
= Summary | |
{ people = Set.insert person people | |
, moneySpent = moneySpent + (priceMap ! itemBought) * unitsBought | |
} | |
updateSummary _ summary _ = summary | |
-- | | |
-- Tying everything all together, let's parse the two tables and stream our | |
-- partial answers as we get more data. | |
-- | |
-- First with a small input: | |
-- | |
-- >>> :{ | |
-- streamAnswers ( Lazy.unlines | |
-- [ "item , price" | |
-- , "computer , 1000" | |
-- , "" | |
-- , "date , person , item-bought , units-bought" | |
-- , "3 , alice , computer , 2" | |
-- ]) | |
-- :} | |
-- [0.0,2000.0] | |
-- | |
-- Then with the sample data: | |
-- | |
-- >>> streamAnswers input | |
-- [0.0,0.0,5000.0,5000.0,7000.0,4000.0] | |
-- | |
-- And finally, to prove that we really streaming and aren't reading the entire | |
-- second table into memory, with an input which throws an exception after we | |
-- touch the last line: | |
-- | |
-- >>> streamAnswers (input <> error "touched the last line") | |
-- [0.0,0.0,5000.0,5000.0,7000.0,4000.0*** Exception: touched the last line | |
-- ... | |
streamAnswers :: Lazy.Text -> [Double] | |
streamAnswers text = purchases | |
-- [Purchase 3 "alice" "computer" 2] | |
& List.scanl' (updateSummary priceMap) initialSummary | |
-- [initialSummary, Summary (Set.fromList ["alice"]) 2000] | |
& fmap averageSpending | |
where | |
firstPart, secondPart :: Lazy.Text | |
(firstPart, secondPart) = breakAtBlankLine text | |
priceMap :: Map Strict.Text Int | |
priceMap = toPriceMap (parseTable firstPart) | |
purchases :: [Purchase] | |
purchases = parseTable secondPart | |
-- | | |
-- Finally, if we only care about the final answer, we don't have to print the | |
-- intermediate results, we can just traverse the entirety of the second table | |
-- and then only print the final result. | |
-- | |
-- >>> finalAnswer input | |
-- 4000.0 | |
finalAnswer :: Lazy.Text -> Double | |
finalAnswer = last . streamAnswers | |
-- This last part runs all the ">>>" tests I wrote throughout this file. | |
main :: IO () | |
main = doctest ["-XOverloadedStrings", "src/Main.hs"] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment