-
-
Save aculich/1074302 to your computer and use it in GitHub Desktop.
wc using Data.Enumerator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE ScopedTypeVariables, ViewPatterns, OverloadedStrings #-} | |
import Control.Applicative | |
import Control.Exception (SomeException) | |
import Control.Monad (when, mapM) | |
import Control.Monad.Trans (lift) | |
import Data.Char (ord) | |
import Data.Enumerator hiding (map, mapM, length, filter, foldl') | |
import Data.List (foldl') | |
import Data.Word (Word8, Word64) | |
import qualified Data.ByteString as SB | |
import qualified Data.ByteString.Char8 as SB8 | |
import qualified Data.ByteString.Lazy as LB; import Data.ByteString.Lazy (ByteString) | |
import qualified Data.Enumerator.Binary as EB | |
import System.IO (stdin) | |
import Text.Printf (printf) | |
type Counter = Int -- Int is ~3% faster | |
type CountingIteratee m = Iteratee SB.ByteString m Counter | |
countCharacters :: forall m. Monad m => CountingIteratee m | |
countCharacters = continue (step 0) | |
where | |
step :: Monad m => Counter -> Stream SB.ByteString -> CountingIteratee m | |
step n _ | n `seq` False = undefined | |
step n EOF = return n | |
step n (Chunks cs) = continue (step $ n + fromIntegral (sum (map SB.length cs))) | |
countWords :: forall m. Monad m => CountingIteratee m | |
countWords = countWords' 0 | |
where | |
isSpc :: Word8 -> Bool | |
isSpc c = c `SB.elem` " \t\r\n\f\v\xa0" | |
countWords' :: Monad m => Counter -> CountingIteratee m | |
countWords' n | n `seq` False = undefined | |
countWords' n = do | |
_ <- EB.takeWhile isSpc | |
cs <- EB.takeWhile (not . isSpc) | |
if LB.null cs then | |
return n | |
else do | |
countWords' (n + 1) -- TODO is that properly optimized (do we get a stack overflow?) | |
countLines :: forall m. Monad m => CountingIteratee m | |
countLines = countLines' 0 | |
where | |
-- Proper line endings: \n, \r, \r\n | |
isEOL :: Word8 -> Bool | |
isEOL c = c == cr || c == lf | |
cr, lf :: Word8 | |
cr = fromIntegral (ord '\r') | |
lf = fromIntegral (ord '\n') | |
countLines' :: Monad m => Counter -> CountingIteratee m | |
countLines' n | n `seq` False = undefined | |
countLines' n = do | |
eof <- isEOF | |
if eof then | |
return n | |
else do | |
_ <- EB.takeWhile (not . isEOL) | |
eol <- EB.head -- consume the first EOL character | |
when (eol == Just cr) $ do | |
c <- EB.head | |
case c of | |
Just c | c == lf -> yield () (Chunks [SB.singleton c]) | |
_ -> return () | |
countLines' (n + 1) | |
count :: (a -> Bool) -> [a] -> Int | |
count p xs = foldl' countOne 0 xs | |
where | |
countOne a x | a `seq` False = undefined | |
countOne a x | p x = a+1 | |
| otherwise = a | |
fanout :: forall a m r. Monad m => [Iteratee a m r] -> Iteratee a m [Either SomeException r] | |
fanout iters = do | |
steps <- mapM (lift . runIteratee) iters | |
continue (step steps) | |
where | |
step :: Monad m | |
=> [Step a m r] -- ^ list of unfinished iteratees or their results/errors | |
-> Stream a -- ^ stream piece to process | |
-> Iteratee a m [Either SomeException r] | |
step ss stream | countContinues ss == 0 = yield (map extractResult ss) stream | |
| otherwise = do | |
ss' <- mapM (stepContinue stream) ss | |
case stream of | |
EOF -> return (map extractResult ss') | |
_ -> continue (step ss') | |
countContinues :: [Step a m r] -> Int | |
countContinues = count isContinue | |
isContinue :: Step a m r -> Bool | |
isContinue (Continue _) = True | |
isContinue _ = False | |
stepContinue :: Monad m => Stream a -> Step a m r -> Iteratee a m (Step a m r) | |
stepContinue stream (Continue k) = lift $ runIteratee (k stream) | |
stepContinue _ s = return s | |
extractResult :: Step a m r -> Either SomeException r | |
extractResult (Yield r _) = Right r | |
extractResult (Error e) = Left e | |
extractResult _ = error "impossible happened" | |
main = do | |
[Right l,Right w,Right c] <- run_ $ EB.enumHandle (65536*4) stdin ==<< fanout [countLines, countWords, countCharacters] | |
printf "%4d %4d %4d\n" l w c |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment