Created
June 8, 2016 09:13
-
-
Save chrisdone/178c42ca6d3853d205800ba0d02c21a7 to your computer and use it in GitHub Desktop.
Fast walking over a file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE ScopedTypeVariables #-} | |
{-# LANGUAGE BangPatterns #-} | |
{- | |
- Nothing : 40,960 bytes allocated in the heap | |
- Open file and close file : 64,768 bytes allocated in the heap | |
- Walk the whole file with a buffer : 151,456 bytes allocated in the heap | |
- Counting and reporting chunks : 163,440 bytes allocated in the heap | |
- Peek each one into a Word16 : 163,448 bytes allocated in the heap | |
- Peek each unit and message count : 307,640 bytes allocated in the heap | |
Processed 4184127 units, 41898484 messages. | |
307,640 bytes allocated in the heap | |
4,912 bytes copied during GC | |
53,936 bytes maximum residency (1 sample(s)) | |
15,696 bytes maximum slop | |
1 MB total memory in use (0 MB lost due to fragmentation) | |
Tot time (elapsed) Avg pause Max pause | |
Gen 0 0 colls, 0 par 0.000s 0.000s 0.0000s 0.0000s | |
Gen 1 1 colls, 0 par 0.000s 0.002s 0.0024s 0.0024s | |
INIT time 0.000s ( 0.000s elapsed) | |
MUT time 0.077s ( 0.237s elapsed) | |
GC time 0.000s ( 0.002s elapsed) | |
EXIT time 0.000s ( 0.002s elapsed) | |
Total time 0.078s ( 0.242s elapsed) | |
%GC time 0.2% (1.0% elapsed) | |
Alloc rate 4,018,916 bytes per MUT second | |
Productivity 99.7% of total user, 32.2% of total elapsed | |
-} | |
import Foreign.Marshal.Alloc | |
import GHC.IO.Handle | |
import System.IO | |
import GHC.Ptr | |
import Foreign.Storable | |
import Data.Word | |
import Foreign.Marshal.Utils | |
import System.Environment | |
-- | Go across the file, parsing in units. | |
main :: IO () | |
main = do | |
fp:_ <- getArgs | |
h <- openFile fp ReadMode | |
buffer <- mallocBytes (bufferSize * 2) | |
(units,msgs) <- processUnits h buffer | |
putStrLn ("Processed " ++ show units ++ " units, " ++ show msgs ++ " messages.") | |
hClose h | |
-- | Make a stride in bolting across the file. | |
processUnits :: Handle -> Ptr a -> IO (Int,Int) | |
processUnits h buffer = getUnit (0 :: Int) (0 :: Int) | |
where | |
getUnit !n msgs = do | |
available <- hGetBuf h buffer bufferSize | |
if available == 0 | |
then return (n, msgs) | |
else if available < 2 | |
then error "Expected sequenced unit length!" | |
else peekUnit buffer available n msgs | |
peekUnit _ 0 n msgs = getUnit n msgs | |
peekUnit _ 1 n msgs = do | |
available <- hGetBuf h (plusPtr buffer 1) (bufferSize - 1) | |
if available < 1 | |
then error "Expected sequenced unit length!" | |
else peekUnit buffer available n msgs | |
peekUnit offset available !n !msgs = do | |
expected <- fmap fromIntegral (peek (castPtr offset) :: IO Word16) | |
if available < expected | |
then do | |
additional <- | |
hGetBuf | |
h | |
(plusPtr buffer bufferSize) | |
(min bufferSize (expected - available)) | |
if available + additional < expected | |
then return (n, msgs) -- Maybe throw an exception here, truncated input. | |
else do | |
count <- handleUnit offset | |
getUnit (n + 1) (msgs + count) | |
else do | |
count <- handleUnit offset | |
peekUnit | |
(plusPtr offset expected) | |
(available - expected) | |
(n + 1) | |
(msgs + count) | |
handleUnit :: Ptr a -> IO Int | |
handleUnit buffer = do | |
msgs <- peek (plusPtr (castPtr buffer) 2) :: IO Word8 | |
return (fromIntegral msgs) | |
-- | The size of buffer to load in from the handle. | |
bufferSize :: Int | |
bufferSize = 1024 * 1024 * 4 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment