Created
July 10, 2011 05:12
-
-
Save ivant/1074297 to your computer and use it in GitHub Desktop.
wc using Data.Enumerator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.PHONY: all clean test | |
DATA_REPEATS?=1000 | |
DATA_FILE?=/usr/share/dict/words | |
GHC_OPTS?=-fspec-constr-count=64 -funfolding-use-threshold=64 | |
all: wce wci | |
clean: | |
rm -f wce wci *.hi *.o | |
test: all | |
@echo Measuring wc using Data.Enumerator: | |
@bash -c 'time (for i in `seq $(DATA_REPEATS)`; do cat $(DATA_FILE); done) | ./wce' | |
@echo | |
@echo Measuring wc using Data.Iteratee: | |
@bash -c 'time (for i in `seq $(DATA_REPEATS)`; do cat $(DATA_FILE); done) | ./wci' | |
@echo | |
@echo Measuring original wc: | |
@bash -c 'time (for i in `seq $(DATA_REPEATS)`; do cat $(DATA_FILE); done) | wc' | |
wce: WCEnumerator.hs | |
ghc $(GHC_OPTS) -o $@ -O2 --make $< | |
wci: WCIteratee.hs | |
ghc $(GHC_OPTS) -o $@ -O2 --make $< |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE ScopedTypeVariables, ViewPatterns, OverloadedStrings, BangPatterns #-} | |
import Control.Applicative | |
import Control.Exception (SomeException) | |
import Control.Monad (when, mapM) | |
import Control.Monad.Trans (lift) | |
import Data.Char (ord) | |
import Data.Enumerator hiding (map, mapM, length, filter, foldl') | |
import qualified Data.Enumerator.List as EL | |
import qualified Data.Enumerator.Binary as EB | |
import Data.List (foldl') | |
import Data.Word (Word8, Word64) | |
import qualified Data.ByteString as SB | |
import qualified Data.ByteString.Char8 as SB8 | |
import qualified Data.ByteString.Lazy as LB; import Data.ByteString.Lazy (ByteString) | |
import System.IO (stdin, hSetBinaryMode) | |
import Text.Printf (printf) | |
type Counter = Word64 -- Int is ~3% faster than Word64 | |
type CountingIteratee m = Iteratee SB.ByteString m Counter | |
countBytes :: forall m. Monad m => CountingIteratee m | |
countBytes = continue (step 0) | |
where | |
step :: Monad m => Counter -> Stream SB.ByteString -> CountingIteratee m | |
step n _ | n `seq` False = undefined | |
step n EOF = return n | |
step n (Chunks cs) = continue (step $ n + fromIntegral (sum (map SB.length cs))) | |
countBytes' :: forall m. Monad m => CountingIteratee m | |
countBytes' = EL.fold (\(!c) bs -> c + fromIntegral (SB.length bs)) 0 | |
countWords :: forall m. Monad m => CountingIteratee m | |
--countWords = continue (step 0 True) | |
--countWords = countWords' 0 | |
countWords = fst <$> EL.fold countInChunk (0, True) | |
where | |
isSpc :: Word8 -> Bool | |
isSpc c = c `SB.elem` " \t\r\n\f\v\xa0" | |
countWords' :: Monad m => Counter -> CountingIteratee m | |
countWords' n | n `seq` False = undefined | |
countWords' n = do | |
_ <- EB.takeWhile isSpc | |
cs <- EB.takeWhile (not . isSpc) | |
if LB.null cs then | |
return n | |
else do | |
countWords' (n + 1) -- TODO is that properly optimized (do we get a stack overflow?) | |
step :: Monad m => Counter -> Bool -> Stream SB.ByteString -> CountingIteratee m | |
step n endedWithSpace _ | n `seq` endedWithSpace `seq` False = undefined | |
step n _ EOF = return n | |
step n endedWithSpace (Chunks cs) = continue (uncurry step $ countInChunks n endedWithSpace cs) | |
countInChunks :: Counter -> Bool -> [SB.ByteString] -> (Counter, Bool) | |
countInChunks (!n) (!endedWithSpace) cs = foldl' countInChunk (n,endedWithSpace) cs | |
countInChunk :: (Counter,Bool) -> SB.ByteString -> (Counter, Bool) | |
countInChunk (!n,!endedWithSpace) c = SB.foldl' (\(!n,!endedWithSpace) c -> | |
case (endedWithSpace, isSpc c) of | |
(False, True) -> (n+1, True) | |
(_, cIsSpace) -> (n, cIsSpace) | |
) (n,endedWithSpace) c | |
countLines :: forall m. Monad m => CountingIteratee m | |
--countLines = countLines' 0 | |
countLines = continue (step 0) | |
where | |
-- Proper line endings: \n, \r, \r\n | |
isEOL :: Word8 -> Bool | |
isEOL c = c == cr || c == lf | |
cr, lf :: Word8 | |
cr = fromIntegral (ord '\r') | |
lf = fromIntegral (ord '\n') | |
countLines' :: Monad m => Counter -> CountingIteratee m | |
countLines' n | n `seq` False = undefined | |
countLines' n = do | |
_ <- EB.takeWhile (not . isEOL) | |
eol <- EB.head -- consume the first EOL character | |
case eol of | |
Just c | c == cr -> do | |
c <- EB.head | |
case c of | |
Just c | c /= lf -> yield () (Chunks [SB.singleton c]) | |
_ -> return () | |
countLines' (n + 1) | |
Just _ -> countLines' (n + 1) | |
Nothing -> return n | |
step :: Monad m => Counter -> Stream SB.ByteString -> CountingIteratee m | |
step n _ | n `seq` False = undefined | |
step n EOF = return n | |
step n (Chunks cs) = continue (step $ n + sum (map (sbCount (==lf)) cs)) | |
sbCount :: (Word8 -> Bool) -> SB.ByteString -> Counter | |
sbCount p s = SB.foldl' (\a c -> if p c then a+1 else a) 0 s | |
count :: (a -> Bool) -> [a] -> Int | |
count p xs = foldl' countOne 0 xs | |
where | |
countOne a x | a `seq` False = undefined | |
countOne a x | p x = a+1 | |
| otherwise = a | |
fanout :: forall a m r. Monad m => [Iteratee a m r] -> Iteratee a m [Either SomeException r] | |
fanout iters = do | |
steps <- mapM (lift . runIteratee) iters | |
continue (step steps) | |
where | |
step :: Monad m | |
=> [Step a m r] -- ^ list of unfinished iteratees or their results/errors | |
-> Stream a -- ^ stream piece to process | |
-> Iteratee a m [Either SomeException r] | |
step ss stream | countContinues ss == 0 = yield (map extractResult ss) stream | |
| otherwise = do | |
ss' <- mapM (stepContinue stream) ss | |
case stream of | |
EOF -> return (map extractResult ss') | |
_ -> continue (step ss') | |
countContinues :: [Step a m r] -> Int | |
countContinues = count isContinue | |
isContinue :: Step a m r -> Bool | |
isContinue (Continue _) = True | |
isContinue _ = False | |
stepContinue :: Monad m => Stream a -> Step a m r -> Iteratee a m (Step a m r) | |
stepContinue stream (Continue k) = lift $ runIteratee (k stream) | |
stepContinue _ s = return s | |
extractResult :: Step a m r -> Either SomeException r | |
extractResult (Yield r _) = Right r | |
extractResult (Error e) = Left e | |
extractResult _ = error "impossible happened" | |
main = do | |
hSetBinaryMode stdin True | |
[Right l,Right w,Right b] <- run_ $ EB.enumHandle (2^12) stdin ==<< fanout [countLines, countWords, countBytes'] | |
printf "%4d %4d %4d\n" l w b | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE ScopedTypeVariables, ViewPatterns, OverloadedStrings, BangPatterns #-} | |
import Control.Applicative | |
import Control.Exception (SomeException) | |
import Control.Monad (when, mapM) | |
import Control.Monad.Trans (lift) | |
import Data.Char (ord) | |
import Data.Iteratee as I hiding (foldl') | |
import Data.Iteratee.IO | |
import qualified Data.Iteratee.ListLike as IL | |
import Data.List (foldl') | |
import Data.Word (Word8, Word64) | |
import qualified Data.ByteString as SB | |
import qualified Data.ByteString.Char8 as SB8 | |
import qualified Data.ByteString.Lazy as LB; import Data.ByteString.Lazy (ByteString) | |
import qualified Data.ByteString.Lazy.Char8 as LB8 | |
import System.IO (stdin, hSetBinaryMode) | |
import Text.Printf (printf) | |
type Counter = Word64 -- Int is ~3% faster than Word64 | |
type CountingIteratee m = Iteratee SB.ByteString m Counter | |
countBytes :: forall m. Monad m => CountingIteratee m | |
countBytes = icont (step 0) Nothing | |
where | |
step :: Monad m => Counter -> Stream SB.ByteString -> CountingIteratee m | |
step n s | n `seq` False = undefined | |
step n s@(EOF _) = idone n s | |
step n (Chunk c) = icont (step $ n + fromIntegral (SB.length c)) Nothing | |
{- | |
countBytes' :: forall m. Monad m => CountingIteratee m | |
countBytes' = IL.foldl' (\(!c) bs -> c + fromIntegral (SB.length bs)) 0 | |
-} | |
countWords :: forall m. Monad m => CountingIteratee m | |
--countWords = countWords' 0 | |
countWords = icont (step 0 True) Nothing | |
--countWords = fst <$> IL.foldl' countInChunk (0, True) | |
where | |
isSpc :: Word8 -> Bool | |
isSpc c = c `SB.elem` " \t\r\n\f\v\xa0" | |
-- The break function returns strict bytestrings => it concatenates them while it gets the prefix. | |
-- This appears to be more efficient (~3.3x) than using lazy bytestrings on the word data, probably | |
-- because the words/space runs are generally short and it's cheaper to concatenate than to allocate | |
-- a sequence of lazy bytestring chunks (and then traverse them). | |
-- Still, Data.Enumerator (lazy bytestring version) is ~2.2x slower than lazy bytestring version of the | |
-- Data.Iteratee. (Is it a bug?) | |
countWords' :: Monad m => Counter -> CountingIteratee m | |
countWords' n | n `seq` False = undefined | |
countWords' n = do | |
_ <- I.break (not . isSpc) | |
cs <- I.break isSpc | |
if SB.null cs then | |
return n | |
else do | |
countWords' (n + 1) -- TODO is that properly optimized (do we get a stack overflow?) | |
step :: Monad m => Counter -> Bool -> Stream SB.ByteString -> CountingIteratee m | |
step n endedWithSpace _ | n `seq` endedWithSpace `seq` False = undefined | |
step n _ s@(EOF _) = idone n s | |
step n endedWithSpace (Chunk c) = icont (uncurry step $ countInChunk (n,endedWithSpace) c) Nothing | |
countInChunk :: (Counter,Bool) -> SB.ByteString -> (Counter, Bool) | |
countInChunk (n,endedWithSpace) c = SB.foldl' (\(!n,endedWithSpace) c -> | |
case (endedWithSpace, isSpc c) of | |
(False, True) -> (n+1, True) | |
(_, cIsSpace) -> (n, cIsSpace) | |
) (n,endedWithSpace) c | |
countLines :: forall m. Monad m => CountingIteratee m | |
--countLines = countLines' 0 | |
countLines = icont (step 0) Nothing | |
where | |
-- Proper line endings: \n, \r, \r\n | |
isEOL :: Word8 -> Bool | |
isEOL c = c == cr || c == lf | |
cr, lf :: Word8 | |
cr = fromIntegral (ord '\r') | |
lf = fromIntegral (ord '\n') | |
countLines' :: Monad m => Counter -> CountingIteratee m | |
countLines' n | n `seq` False = undefined | |
countLines' n = do | |
_ <- I.break isEOL | |
eol <- I.checkErr I.head -- consume the first EOL character | |
case eol of | |
Right c | c == cr -> do | |
c <- I.checkErr I.head | |
case c of | |
Right c | c /= lf -> idone () (Chunk (SB.singleton c)) | |
_ -> return () | |
countLines' (n + 1) | |
Right _ -> countLines' (n + 1) | |
_ -> return n | |
step :: Monad m => Counter -> Stream SB.ByteString -> CountingIteratee m | |
step n _ | n `seq` False = undefined | |
step n s@(EOF _) = idone n s | |
step n (Chunk c) = icont (step $ n + sbCount (==lf) c) Nothing | |
sbCount :: (Word8 -> Bool) -> SB.ByteString -> Counter | |
sbCount p s = SB.foldl' (\a c -> if p c then a+1 else a) 0 s | |
-- eneeCheckIfDone :: (Monad m, NullPoint elo) => ((Stream eli -> Iteratee eli m a) -> Iteratee elo m (Iteratee eli m a)) -> Enumeratee elo eli m a | |
-- liftI . step :: Monad m => (Stream eli -> Iteratee eli m a) -> Iteratee elo m (Iteratee eli m a) | |
-- liftI :: Monad m => (Stream elo -> Iteratee elo m (Iteratee eli m a)) -> Iteratee elo m (Iteratee eli m a) | |
-- step :: Monad m => Stream elo -> Iteratee elo m (Iteratee eli m a) | |
-- | |
count :: (a -> Bool) -> [a] -> Int | |
count p xs = foldl' countOne 0 xs | |
where | |
countOne a x | a `seq` False = undefined | |
countOne a x | p x = a+1 | |
| otherwise = a | |
data FanoutState a m r = FanoutIter { fanoutIter :: Iteratee a m r } | |
| FanoutErr { fanoutErr :: SomeException } | |
| FanoutRes { fanoutRes :: r } | |
fanout :: forall a m r. (Show a, Nullable a, Monad m) => [Iteratee a m r] -> Iteratee a m [Either SomeException r] | |
fanout iters = icont (step (map FanoutIter iters)) Nothing | |
where | |
step :: Monad m => [FanoutState a m r] -> Stream a -> Iteratee a m [Either SomeException r] | |
step iters stream | countIters iters == 0 = idone (map extractResult iters) stream | |
| otherwise = do | |
iters' <- mapM (stepOne stream) iters | |
case stream of | |
(EOF _) -> step iters' stream | |
_ -> icont (step iters') Nothing | |
extractResult :: FanoutState a m r -> Either SomeException r | |
extractResult (FanoutRes r) = Right r | |
extractResult (FanoutErr e) = Left e | |
extractResult _ = error "impossible happened" | |
countIters :: [FanoutState a m r] -> Int | |
countIters = count isFanoutIter | |
isFanoutIter :: FanoutState a m r -> Bool | |
isFanoutIter (FanoutIter _) = True | |
isFanoutIter _ = False | |
stepOne :: Monad m => Stream a -> FanoutState a m r -> Iteratee a m (FanoutState a m r) | |
stepOne _ f@(FanoutRes _) = return f | |
stepOne _ f@(FanoutErr _) = return f | |
stepOne stream (FanoutIter iter) = do | |
iter' <- lift $ runIter iter onDone onCont | |
case (stream, iter') of | |
(EOF _, FanoutIter iter') -> lift $ runIter iter' onDone (error "divergent iteratee") | |
_ -> return iter' | |
where | |
onDone :: Monad m => r -> Stream a -> m (FanoutState a m r) | |
onDone r _ = return $ FanoutRes r | |
onCont :: Monad m => (Stream a -> Iteratee a m r) -> Maybe SomeException -> m (FanoutState a m r) | |
onCont _ (Just err) = undefined --return $ FanoutErr err | |
onCont k Nothing = return $ FanoutIter (k stream) | |
main = do | |
hSetBinaryMode stdin True | |
[Right l,Right w,Right b] <- run $ joinIM $ enumHandle (2^12) stdin $ fanout [countLines, countWords, countBytes] | |
printf "%4d %4d %4d\n" l w b | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment