Skip to content

Instantly share code, notes, and snippets.

@sergv
Created February 13, 2018 22:53
Show Gist options
  • Save sergv/b4642943db3f59293531c5291e3e3af7 to your computer and use it in GitHub Desktop.
Save sergv/b4642943db3f59293531c5291e3e3af7 to your computer and use it in GitHub Desktop.
Benchmarks for removal of extra argument from folds in the 'bytestring' package
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
module BytestringFolds (main) where
import Control.DeepSeq
import Control.Exception
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as C8
import qualified Data.ByteString.Internal as BS
import Data.Word
import Foreign.ForeignPtr
import Foreign.Ptr
import Foreign.Storable
import qualified GHC.IO.Unsafe as Unsafe
import Gauge
{-# INLINE foldl'_no_extra_arg #-}
foldl'_no_extra_arg :: (a -> Word8 -> a) -> a -> BS.ByteString -> a
foldl'_no_extra_arg f v (BS.PS fp off len) =
BS.accursedUnutterablePerformIO $ withForeignPtr fp g
where
g ptr = go v ptr'
where
ptr' = ptr `plusPtr` off
end = ptr' `plusPtr` len
-- tail recursive; traverses array left to right
go !z !p | p == end = return z
| otherwise = do x <- peek p
go (f z x) (p `plusPtr` 1)
{-# NOINLINE bsum #-}
bsum :: BS.ByteString -> Int
bsum = BS.foldl' (\s x -> s + fromIntegral x) 0
{-# NOINLINE bsum_no_extra_arg #-}
bsum_no_extra_arg :: BS.ByteString -> Int
bsum_no_extra_arg = foldl'_no_extra_arg (\s x -> s + fromIntegral x) 0
mapAccumR_no_extra_arg :: (acc -> Word8 -> (acc, Word8)) -> acc -> BS.ByteString -> (acc, BS.ByteString)
mapAccumR_no_extra_arg f acc (BS.PS fp o len) = Unsafe.unsafeDupablePerformIO $ withForeignPtr fp $ \a -> do
gp <- BS.mallocByteString len
acc' <- withForeignPtr gp (go a)
return $! (acc', BS.PS gp 0 len)
where
go a ptr = mapAccumR_ acc (len-1)
where
p = a `plusPtr` o
q = ptr
mapAccumR_ !s !n
| n < 0 = return s
| otherwise = do
x <- peekByteOff p n
let (s', y) = f s x
pokeByteOff q n y
mapAccumR_ s' (n-1)
mapAccumR_opt_patternmatch :: (acc -> Word8 -> (acc, Word8)) -> acc -> BS.ByteString -> (acc, BS.ByteString)
mapAccumR_opt_patternmatch f acc (BS.PS fp o len) = Unsafe.unsafeDupablePerformIO $ withForeignPtr fp $ \a -> do
gp <- BS.mallocByteString len
acc' <- withForeignPtr gp (go a)
return $! (acc', BS.PS gp 0 len)
where
go a ptr = mapAccumR_ acc (len-1)
where
p = a `plusPtr` o
q = ptr
mapAccumR_ !s (-1) = return s
mapAccumR_ !s !n = do
x <- peekByteOff p n
let (s', y) = f s x
pokeByteOff q n y
mapAccumR_ s' (n-1)
sumIncrement :: BS.ByteString -> (Int, BS.ByteString)
sumIncrement = BS.mapAccumR (\s w -> (s + fromIntegral w, succ w)) 0
sumIncrement_no_extra_arg :: BS.ByteString -> (Int, BS.ByteString)
sumIncrement_no_extra_arg = mapAccumR_no_extra_arg (\s w -> (s + fromIntegral w, succ w)) 0
sumIncrement_opt_patternmatch :: BS.ByteString -> (Int, BS.ByteString)
sumIncrement_opt_patternmatch = mapAccumR_opt_patternmatch (\s w -> (s + fromIntegral w, succ w)) 0
main :: IO ()
main = do
let config = Gauge.defaultConfig
let bss :: [(Int, BS.ByteString)]
bss = [ (n, C8.pack $ concat $ replicate n "abcd0123\n45xyz_\r")
| n <- [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 65536]
]
evaluate $ rnf bss
case filter (\(_, _, b) -> b) $ map (\(n, bs) -> (n, bs, bsum bs /= bsum_no_extra_arg bs)) bss of
[] -> pure ()
xs -> error $ "Bsum implementations do not match:\n" ++ unlines (map show xs)
case filter (\(_, _, b) -> b) $ map (\(n, bs) -> (n, bs, sumIncrement bs /= sumIncrement_no_extra_arg bs || sumIncrement bs /= sumIncrement_opt_patternmatch bs)) bss of
[] -> pure ()
xs -> error $ "sumIncrement implementations do not match:\n" ++ unlines (map show xs)
defaultMainWith config
[ bgroup "bsum"
[ bgroup ("length " ++ show n)
[ bench "vanilla" $ whnf bsum bs
, bench "no_extra_arg" $ whnf bsum_no_extra_arg bs
]
| (n, bs) <- bss
]
, bgroup "sumIncrement"
[ bgroup ("length " ++ show n)
[ bench "vanilla" $ whnf sumIncrement bs
, bench "no_extra_arg" $ whnf sumIncrement_no_extra_arg bs
, bench "opt_patternmatch" $ whnf sumIncrement_opt_patternmatch bs
]
| (n, bs) <- bss
]
]
Outputs on my machine, Intel(R) Core(TM) i7-4710MQ CPU @ 2.50GHz.
bsum/length 1/vanilla mean 174.4 ns ( +- 18.22 ns )
bsum/length 1/no_extra_arg mean 164.4 ns ( +- 1.837 ns )
bsum/length 2/vanilla mean 323.1 ns ( +- 5.863 ns )
bsum/length 2/no_extra_arg mean 308.9 ns ( +- 6.802 ns )
bsum/length 4/vanilla mean 619.0 ns ( +- 5.495 ns )
bsum/length 4/no_extra_arg mean 595.6 ns ( +- 30.28 ns )
bsum/length 8/vanilla mean 1.220 μs ( +- 14.77 ns )
bsum/length 8/no_extra_arg mean 1.177 μs ( +- 15.15 ns )
bsum/length 16/vanilla mean 2.414 μs ( +- 54.11 ns )
bsum/length 16/no_extra_arg mean 2.312 μs ( +- 24.22 ns )
bsum/length 32/vanilla mean 4.769 μs ( +- 58.11 ns )
bsum/length 32/no_extra_arg mean 4.667 μs ( +- 132.6 ns )
bsum/length 64/vanilla mean 9.679 μs ( +- 677.6 ns )
bsum/length 64/no_extra_arg mean 9.487 μs ( +- 677.2 ns )
bsum/length 128/vanilla mean 18.91 μs ( +- 150.4 ns )
bsum/length 128/no_extra_arg mean 18.71 μs ( +- 431.8 ns )
bsum/length 256/vanilla mean 37.44 μs ( +- 616.0 ns )
bsum/length 256/no_extra_arg mean 38.07 μs ( +- 1.697 μs )
bsum/length 512/vanilla mean 74.22 μs ( +- 2.455 μs )
bsum/length 512/no_extra_arg mean 75.04 μs ( +- 1.409 μs )
bsum/length 1024/vanilla mean 146.9 μs ( +- 3.562 μs )
bsum/length 1024/no_extra_arg mean 150.9 μs ( +- 3.234 μs )
bsum/length 2048/vanilla mean 294.0 μs ( +- 6.154 μs )
bsum/length 2048/no_extra_arg mean 302.1 μs ( +- 6.448 μs )
bsum/length 4096/vanilla mean 593.2 μs ( +- 32.85 μs )
bsum/length 4096/no_extra_arg mean 614.9 μs ( +- 22.74 μs )
bsum/length 65536/vanilla mean 9.441 ms ( +- 326.7 μs )
bsum/length 65536/no_extra_arg mean 9.834 ms ( +- 384.2 μs )
sumIncrement/length 1/vanilla mean 166.3 ns ( +- 8.427 ns )
sumIncrement/length 1/no_extra_arg mean 49.53 ns ( +- 2.942 ns )
sumIncrement/length 1/opt_patternmatch mean 55.17 ns ( +- 2.297 ns )
sumIncrement/length 2/vanilla mean 307.5 ns ( +- 12.54 ns )
sumIncrement/length 2/no_extra_arg mean 80.41 ns ( +- 1.966 ns )
sumIncrement/length 2/opt_patternmatch mean 75.17 ns ( +- 800.0 ps )
sumIncrement/length 4/vanilla mean 575.6 ns ( +- 40.57 ns )
sumIncrement/length 4/no_extra_arg mean 123.8 ns ( +- 7.467 ns )
sumIncrement/length 4/opt_patternmatch mean 115.7 ns ( +- 6.147 ns )
sumIncrement/length 8/vanilla mean 1.134 μs ( +- 145.4 ns )
sumIncrement/length 8/no_extra_arg mean 213.0 ns ( +- 24.85 ns )
sumIncrement/length 8/opt_patternmatch mean 195.7 ns ( +- 4.029 ns )
sumIncrement/length 16/vanilla mean 2.184 μs ( +- 207.3 ns )
sumIncrement/length 16/no_extra_arg mean 381.9 ns ( +- 32.79 ns )
sumIncrement/length 16/opt_patternmatch mean 356.6 ns ( +- 21.14 ns )
sumIncrement/length 32/vanilla mean 4.239 μs ( +- 209.6 ns )
sumIncrement/length 32/no_extra_arg mean 717.4 ns ( +- 28.89 ns )
sumIncrement/length 32/opt_patternmatch mean 677.3 ns ( +- 23.22 ns )
sumIncrement/length 64/vanilla mean 8.695 μs ( +- 1.299 μs )
sumIncrement/length 64/no_extra_arg mean 1.365 μs ( +- 96.48 ns )
sumIncrement/length 64/opt_patternmatch mean 1.309 μs ( +- 45.48 ns )
sumIncrement/length 128/vanilla mean 16.64 μs ( +- 86.62 ns )
sumIncrement/length 128/no_extra_arg mean 2.654 μs ( +- 168.9 ns )
sumIncrement/length 128/opt_patternmatch mean 2.612 μs ( +- 178.4 ns )
sumIncrement/length 256/vanilla mean 33.35 μs ( +- 1.113 μs )
sumIncrement/length 256/no_extra_arg mean 5.139 μs ( +- 169.0 ns )
sumIncrement/length 256/opt_patternmatch mean 5.048 μs ( +- 12.96 ns )
sumIncrement/length 512/vanilla mean 66.51 μs ( +- 1.789 μs )
sumIncrement/length 512/no_extra_arg mean 10.22 μs ( +- 391.3 ns )
sumIncrement/length 512/opt_patternmatch mean 10.08 μs ( +- 153.1 ns )
sumIncrement/length 1024/vanilla mean 133.8 μs ( +- 4.398 μs )
sumIncrement/length 1024/no_extra_arg mean 20.46 μs ( +- 723.4 ns )
sumIncrement/length 1024/opt_patternmatch mean 20.14 μs ( +- 220.0 ns )
sumIncrement/length 2048/vanilla mean 265.9 μs ( +- 2.690 μs )
sumIncrement/length 2048/no_extra_arg mean 40.62 μs ( +- 1.195 μs )
sumIncrement/length 2048/opt_patternmatch mean 40.68 μs ( +- 2.057 μs )
sumIncrement/length 4096/vanilla mean 538.5 μs ( +- 9.456 μs )
sumIncrement/length 4096/no_extra_arg mean 81.46 μs ( +- 2.766 μs )
sumIncrement/length 4096/opt_patternmatch mean 80.31 μs ( +- 941.5 ns )
sumIncrement/length 65536/vanilla mean 8.628 ms ( +- 140.5 μs )
sumIncrement/length 65536/no_extra_arg mean 1.296 ms ( +- 74.27 μs )
sumIncrement/length 65536/opt_patternmatch mean 1.289 ms ( +- 15.95 μs )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment