Created
April 25, 2012 23:06
-
-
Save killerswan/2494260 to your computer and use it in GitHub Desktop.
Haskell BLOWS UP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Kevin Cantu
-- April 2012
-- Count IP addresses per (latitude, longitude) location.
--
-- Runtime: 16 s on one million lines of CSV;
--          never finishes on 3 million lines (RAM blows up).
-- Input record format: (ip,lat.000,lon.000)
import Control.DeepSeq
import Control.Monad (when)
import Data.List
import Data.Map (Map)
import qualified Data.Map as M
import qualified Data.Map.Strict as MS
import System.Environment
import System.IO
import Text.CSV
import Text.Printf
-- | Read and parse the CSV file at @path@.
--
-- A parse failure is turned into a call to 'error' carrying the shown
-- parse error; otherwise the parsed records are returned as-is.
-- The result type is what Text.CSV calls @CSV@ (a list of records, each a
-- list of string fields).
--
-- NOTE: this is where ALL our allocation goes
loadCSV :: FilePath -> IO [[String]]
loadCSV path = fmap (either (error . show) id) (parseCSVFromFile path)
| {- | |
| -- an arbitrary test filter | |
| lessThan7 xs = | |
| length xs == 4 && field1' < 7 | |
| where | |
| field1 :: String | |
| field1 = xs !! 1 | |
| field1' = read field1 :: Int | |
| -} | |
-- | Key type of the count map: a (latitude, longitude) pair.
--
-- Experimentally, strings are faster here than actual numbers, so the
-- coordinates are kept as the raw text fields:
--type Location = (Rational, Rational)
type Location = (String, String)

-- | The initial, empty location map: (lat,lon) --> count.
newMap :: Map Location Integer
newMap = M.empty
-- | Increment the count stored under @key@ in @mm@, inserting a count of 1
-- when the key is not present yet.
--
-- Uses the value-strict 'MS.insertWith' from "Data.Map.Strict": the old
-- @insertWith'@ has been removed from the containers package, and the strict
-- variant keeps the same behaviour of forcing each updated count instead of
-- accumulating a chain of @(+)@ thunks per key.
--
-- NOTE: profiling indicates this gets called as many times as readCoord
-- NOTE: sprinkling bangs here makes no improvement
-- NOTE: deepseq (key) makes no improvement
updateMap :: (Ord k, Num a) => Map k a -> k -> Map k a
updateMap mm key = MS.insertWith (+) key 1 mm
-- | Header line printed ahead of the CSV output.
showTitle :: String
showTitle = "LATITUDE,LONGITUDE,COUNT"
-- | Turn one (location, count) entry into a CSV record and append it to the
-- end of the records accumulated so far.
--
-- The record is @[lat, lon, show count]@. Appending with '++' re-walks the
-- whole accumulator on every call, so this might be slow.
--
-- Earlier variant, for Rational coordinates:
--   record = [showRationalDeg lat, showRationalDeg lon, show count]
resultToRecord :: Show a => (String, String) -> a -> [[String]] -> [[String]]
resultToRecord loc count acc = acc ++ [[fst loc, snd loc, show count]]
| {- | |
| -- convert a rational degree into string | |
| showRationalDeg deg = | |
| showF . toF $ deg | |
| where | |
| toF rr = (fromRational rr :: Float) | |
| showF ff = printf "%0.4f" ff | |
| -} | |
-- | Extract the coordinate pair from one parsed CSV record.
--
-- A record must have exactly three fields; the first one is ignored and the
-- second and third are returned as (lat, lon). Any other record length is a
-- fatal error whose message includes the offending record.
--
-- The three-element pattern replaces a @length@ check followed by two @!!@
-- lookups, so the list is walked once and no partial indexing is needed.
--
-- NOTE: profiling indicates all our time is here
-- NOTE: sprinkling bangs here makes no improvement
readCoord :: Show a => [a] -> (a, a)
readCoord [_, lat, lon] = (lat, lon)
readCoord record = error $ "wrong size record: " ++ show record
| {- | |
| where | |
| --ip = read (record !! 0) :: Integer | |
| lat = read (record !! 1) :: Float | |
| lon = read (record !! 2) :: Float | |
| lat' = toRational lat :: Rational | |
| lon' = toRational lon :: Rational | |
| -} | |
-- | Pick the CSV input path out of the command-line arguments.
--
-- Exactly one argument is expected and is returned as the path; any other
-- argument count aborts with a usage message. Matching on the list shape
-- avoids the separate @length@ check and the partial @!!@ lookup.
getInputFile :: [String] -> FilePath
getInputFile [path] = path
getInputFile _ = error "usage: count csvInputFile"
-- | Whether progress messages are emitted at all.
verbose :: Bool
verbose = True

-- | Emit a progress message when 'verbose' is set.
--
-- Messages go to stderr rather than stdout: the program prints its CSV
-- result on stdout, and interleaving log lines there would corrupt the
-- data for anyone redirecting or piping the output.
now :: String -> IO ()
now msg = when verbose (hPutStrLn stderr msg)
-- | Entry point: read the CSV named by the single command-line argument,
-- count how many records fall on each coordinate, and print the counts as
-- CSV (title line first) on stdout.
main :: IO ()
main = do
  args <- getArgs
  now "counting IPs per coordinate..."
  recs <- loadCSV $ getInputFile args
  let coordsMap = createMapFromRecords recs
  now "saving/printing..."
  putStrLn showTitle
  putStrLn . printCSV . toRecords $ coordsMap
  now "ok."
  where
    -- Records that do not have exactly three fields are silently skipped
    -- before the coordinates are extracted and counted.
    createMapFromRecords =
      myfold updateMap newMap . map readCoord . filter (\xs -> length xs == 3)

    -- Strict left fold, so the map is forced after every update.
    --NOTE: still doesn't help
    --myfold fun a bs = a `deepseq` bs `deepseq` (foldl' fun a bs)
    myfold = foldl'

    -- One output record per map entry, in descending key order. This is the
    -- same order the previous right fold with an appending accumulator
    -- produced, but each record is built against an empty accumulator and
    -- the pieces are concatenated once, so the work is linear in the number
    -- of distinct locations instead of quadratic.
    toRecords m =
      concat [resultToRecord loc count [] | (loc, count) <- M.toDescList m]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment