Memory usage for various methods of storing Haskell byte strings in PostgreSQL.
-- I am working on a tool that downloads the Hackage index, stores it, and
-- processes it. I am storing the index in PostgreSQL. I want to keep the
-- maximum residency down. I don't care too much about total runtime.
--
-- This file contains a bunch of different operations in various formats. The
-- idea is to get a baseline measurement for each operation (like downloading
-- the index or reading it from a file) and each format (streaming, lazy, or
-- strict). Those measurements can be used to create a minimum memory
-- requirement for the actual tool.
--
-- I was motivated to perform this experiment because naively storing byte
-- strings (either lazy or strict) in PostgreSQL bytea columns using the
-- postgresql-simple library used far more memory than I expected. In
-- retrospect, the memory usage shouldn't have been surprising: the entire
-- byte string is base-16 encoded and sent over the wire as ASCII.
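--
-- As a back-of-the-envelope check (my arithmetic, not a new measurement):
-- hex encoding doubles the payload, so the 87,043,229 byte index turns into
-- roughly 174 MB of ASCII. That lines up with the 174 MB maximum residency
-- of writeByteaStrict below, and writeByteaLazy peaks at about twice that.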
--
-- The high-level takeaway for me is that storing byte strings in PostgreSQL
-- should be done with large objects: readLargeObjectStrict below peaks at
-- about 0.5 MB maximum residency, versus 332 MB for either bytea read.
-- Furthermore, strict byte strings seem to optimize surprisingly well.
--
-- At the time of writing (2020-02-28), the Hackage index was 87,043,229
-- bytes. All measurements were taken with GHC 8.8.2 using `+RTS -s -RTS`.
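--
-- To reproduce a measurement, build this file and pass one operation name as
-- the only argument. For example (the binary name here is just a
-- placeholder):
--
-- > ghc -O2 memory-usage.hs
-- > ./memory-usage writeLargeObjectStrict +RTS -s -RTS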
module Main ( main ) where
import Data.Function ((&))
import qualified Codec.Archive.Tar as Tar
import qualified Codec.Compression.GZip as Gzip
import qualified Control.Exception as Exception
import qualified Control.Monad as Monad
import qualified Crypto.Hash as Crypto
import qualified Data.Bits as Bits
import qualified Data.ByteArray as ByteArray
import qualified Data.ByteString as ByteString
import qualified Data.ByteString.Builder as Builder
import qualified Data.ByteString.Lazy as LazyByteString
import qualified Data.IORef as IORef
import qualified Data.String as String
import qualified Database.PostgreSQL.Simple as Sql
import qualified Database.PostgreSQL.Simple.LargeObjects as Sql
import qualified Network.HTTP.Client as Client
import qualified Network.HTTP.Client.TLS as Tls
import qualified System.Environment as Environment
import qualified System.IO as IO
main :: IO ()
main = do
  arguments <- Environment.getArgs
  case arguments of
    ["doNothing"] -> doNothing
    ["downloadStream"] -> downloadStream
    ["downloadLazy"] -> downloadLazy
    ["downloadStrict"] -> downloadStrict
    ["xorLazy"] -> xorLazy
    ["xorStrict"] -> xorStrict
    ["digestLazy"] -> digestLazy
    ["digestStrict"] -> digestStrict
    ["walkLazy"] -> walkLazy
    ["walkStrict"] -> walkStrict
    ["writeByteaLazy"] -> writeByteaLazy
    ["writeByteaStrict"] -> writeByteaStrict
    ["writeByteaWithDigestLazy"] -> writeByteaWithDigestLazy
    ["writeByteaWithDigestStrict"] -> writeByteaWithDigestStrict
    ["writeByteaWithSizeLazy"] -> writeByteaWithSizeLazy
    ["writeByteaWithSizeStrict"] -> writeByteaWithSizeStrict
    ["writeByteaWithDigestAndSizeLazy"] -> writeByteaWithDigestAndSizeLazy
    ["writeByteaWithDigestAndSizeStrict"] -> writeByteaWithDigestAndSizeStrict
    ["readByteaLazy"] -> readByteaLazy
    ["readByteaStrict"] -> readByteaStrict
    ["writeLargeObjectLazy"] -> writeLargeObjectLazy
    ["writeLargeObjectStrict"] -> writeLargeObjectStrict
    ["readLargeObjectLazy"] -> readLargeObjectLazy
    ["readLargeObjectStrict"] -> readLargeObjectStrict
    ["pruneLargeObjects"] -> pruneLargeObjects
    _ -> fail $ "unexpected arguments: " <> show arguments
indexUrl :: String
indexUrl = "https://hackage.haskell.org/01-index.tar.gz"
indexFile :: FilePath
indexFile = "01-index.tar.gz"
withConnection :: (Sql.Connection -> IO a) -> IO a
withConnection = Exception.bracket (Sql.connectPostgreSQL ByteString.empty) Sql.close
createByteStringTable :: Sql.Connection -> IO ()
createByteStringTable connection = do
  Monad.void . Sql.execute_ connection $ query
    "create table if not exists byte_string (\
    \id integer primary key, \
    \content bytea, \
    \digest bytea, \
    \size integer)"
  Monad.void . Sql.execute_ connection $ query "truncate table byte_string"
createLargeObjectTable :: Sql.Connection -> IO ()
createLargeObjectTable connection = Monad.void . Sql.execute_ connection $ query
"create table if not exists large_object (\
\oid oid primary key, \
\digest bytea, \
\size integer)"
pruneLargeObjects :: IO ()
pruneLargeObjects = withConnection $ \ connection -> do
  rows <- Sql.query_ connection $ query "select oid from large_object"
  Monad.forM_ rows $ \ row -> Sql.withTransaction connection $ do
    Sql.loUnlink connection $ Sql.fromOnly row
    Sql.execute connection (query "delete from large_object where oid = ?") row
query :: String -> Sql.Query
query = String.fromString
sha256 :: Crypto.Digest Crypto.SHA256 -> Crypto.Digest Crypto.SHA256
sha256 = id
byteString :: ByteString.ByteString -> ByteString.ByteString
byteString = id
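-- These two identity functions exist only to pin down types: Crypto.hash and
-- ByteArray.convert are polymorphic in their results, so without them GHC
-- could not infer which digest algorithm or byte array type is meant.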
-- 66,552 bytes allocated in the heap
-- 15,800 bytes copied during GC
-- 48,640 bytes maximum residency (1 sample(s))
-- 25,088 bytes maximum slop
-- 0 MB total memory in use (0 MB lost due to fragmentation)
doNothing :: IO ()
doNothing = pure ()
-- 476,856,000 bytes allocated in the heap
-- 22,033,864 bytes copied during GC
-- 3,980,736 bytes maximum residency (6 sample(s))
-- 82,496 bytes maximum slop
-- 3 MB total memory in use (0 MB lost due to fragmentation)
downloadStream :: IO ()
downloadStream = do
  manager <- Tls.newTlsManager
  request <- Client.parseUrlThrow indexUrl
  -- withBinaryFile (rather than a bare openBinaryFile) makes sure the handle
  -- is flushed and closed even if the download fails partway through.
  IO.withBinaryFile indexFile IO.WriteMode $ \ handle ->
    Client.withResponse request manager $ \ response ->
      let
        body = Client.responseBody response
        loop = do
          chunk <- Client.brRead body
          Monad.unless (ByteString.null chunk) $ do
            ByteString.hPut handle chunk
            loop
      in loop
-- 460,711,120 bytes allocated in the heap
-- 38,340,264 bytes copied during GC
-- 57,126,280 bytes maximum residency (10 sample(s))
-- 13,161,080 bytes maximum slop
-- 54 MB total memory in use (1 MB lost due to fragmentation)
downloadLazy :: IO ()
downloadLazy = do
  manager <- Tls.newTlsManager
  request <- Client.parseUrlThrow indexUrl
  response <- Client.httpLbs request manager
  response
    & Client.responseBody
    & LazyByteString.writeFile indexFile
-- 490,346,976 bytes allocated in the heap
-- 39,302,176 bytes copied during GC
-- 58,373,848 bytes maximum residency (10 sample(s))
-- 13,318,440 bytes maximum slop
-- 55 MB total memory in use (1 MB lost due to fragmentation)
downloadStrict :: IO ()
downloadStrict = do
  manager <- Tls.newTlsManager
  request <- Client.parseUrlThrow indexUrl
  response <- Client.httpLbs request manager
  response
    & Client.responseBody
    & LazyByteString.toStrict
    & ByteString.writeFile indexFile
-- 2,918,612,168 bytes allocated in the heap
-- 794,016 bytes copied during GC
-- 66,153,704 bytes maximum residency (6 sample(s))
-- 8,237,848 bytes maximum slop
-- 63 MB total memory in use (0 MB lost due to fragmentation)
xorLazy :: IO ()
xorLazy = do
  content <- LazyByteString.readFile indexFile
  content
    & LazyByteString.foldr Bits.xor 0x00
    & print
-- 87,147,704 bytes allocated in the heap
-- 34,048 bytes copied during GC
-- 70,208 bytes maximum residency (1 sample(s))
-- 36,288 bytes maximum slop
-- 0 MB total memory in use (0 MB lost due to fragmentation)
xorStrict :: IO ()
xorStrict = do
  content <- ByteString.readFile indexFile
  content
    & ByteString.foldr' Bits.xor 0x00
    & print
-- 89,384,280 bytes allocated in the heap
-- 95,024 bytes copied during GC
-- 70,232 bytes maximum residency (2 sample(s))
-- 32,168 bytes maximum slop
-- 0 MB total memory in use (0 MB lost due to fragmentation)
digestLazy :: IO ()
digestLazy = do
  content <- LazyByteString.readFile indexFile
  content
    & Crypto.hashlazy
    & sha256
    & print
-- 87,167,104 bytes allocated in the heap
-- 50,904 bytes copied during GC
-- 87,097,624 bytes maximum residency (2 sample(s))
-- 1,044,200 bytes maximum slop
-- 83 MB total memory in use (0 MB lost due to fragmentation)
digestStrict :: IO ()
digestStrict = do
  content <- ByteString.readFile indexFile
  content
    & Crypto.hash
    & sha256
    & print
-- 927,177,008 bytes allocated in the heap
-- 481,552 bytes copied during GC
-- 132,736 bytes maximum residency (2 sample(s))
-- 35,200 bytes maximum slop
-- 0 MB total memory in use (0 MB lost due to fragmentation)
walkLazy :: IO ()
walkLazy = do
  content <- LazyByteString.readFile indexFile
  content
    & Gzip.decompress
    & Tar.read
    & Tar.foldEntries (:) [] Exception.throw
    & length
    & print
-- 923,881,776 bytes allocated in the heap
-- 369,896 bytes copied during GC
-- 87,135,968 bytes maximum residency (2 sample(s))
-- 1,046,816 bytes maximum slop
-- 83 MB total memory in use (0 MB lost due to fragmentation)
walkStrict :: IO ()
walkStrict = do
  content <- ByteString.readFile indexFile
  content
    & LazyByteString.fromStrict
    & Gzip.decompress
    & Tar.read
    & Tar.foldEntries (:) [] Exception.throw
    & length
    & print
-- 524,083,312 bytes allocated in the heap
-- 679,928 bytes copied during GC
-- 348,234,184 bytes maximum residency (8 sample(s))
-- 8,104,224 bytes maximum slop
-- 332 MB total memory in use (0 MB lost due to fragmentation)
writeByteaLazy :: IO ()
writeByteaLazy = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- LazyByteString.readFile indexFile
  Monad.void . Sql.execute connection
    (query "insert into byte_string (id, content) values (1, ?)")
    . Sql.Only $ Sql.Binary content
-- 435,433,360 bytes allocated in the heap
-- 73,536 bytes copied during GC
-- 174,115,840 bytes maximum residency (3 sample(s))
-- 1,020,928 bytes maximum slop
-- 166 MB total memory in use (1 MB lost due to fragmentation)
writeByteaStrict :: IO ()
writeByteaStrict = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- ByteString.readFile indexFile
  Monad.void . Sql.execute connection
    (query "insert into byte_string (id, content) values (1, ?)")
    . Sql.Only $ Sql.Binary content
-- 524,959,288 bytes allocated in the heap
-- 913,136 bytes copied during GC
-- 348,234,296 bytes maximum residency (8 sample(s))
-- 7,058,672 bytes maximum slop
-- 332 MB total memory in use (0 MB lost due to fragmentation)
writeByteaWithDigestLazy :: IO ()
writeByteaWithDigestLazy = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- LazyByteString.readFile indexFile
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, digest) values (1, ?, ?)")
    ( Sql.Binary content
    , Sql.Binary . byteString . ByteArray.convert . sha256 $ Crypto.hashlazy content
    )
-- 435,464,808 bytes allocated in the heap
-- 94,104 bytes copied during GC
-- 87,072,960 bytes maximum residency (4 sample(s))
-- 1,032,000 bytes maximum slop
-- 83 MB total memory in use (0 MB lost due to fragmentation)
writeByteaWithDigestStrict :: IO ()
writeByteaWithDigestStrict = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- ByteString.readFile indexFile
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, digest) values (1, ?, ?)")
    ( Sql.Binary content
    , Sql.Binary . byteString . ByteArray.convert . sha256 $ Crypto.hash content
    )
-- 524,150,008 bytes allocated in the heap
-- 913,040 bytes copied during GC
-- 348,234,160 bytes maximum residency (8 sample(s))
-- 7,058,664 bytes maximum slop
-- 332 MB total memory in use (0 MB lost due to fragmentation)
writeByteaWithSizeLazy :: IO ()
writeByteaWithSizeLazy = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- LazyByteString.readFile indexFile
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, size) values (1, ?, ?)")
    ( Sql.Binary content
    , LazyByteString.length content
    )
-- 435,461,056 bytes allocated in the heap
-- 74,784 bytes copied during GC
-- 174,115,840 bytes maximum residency (3 sample(s))
-- 1,020,928 bytes maximum slop
-- 166 MB total memory in use (1 MB lost due to fragmentation)
writeByteaWithSizeStrict :: IO ()
writeByteaWithSizeStrict = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- ByteString.readFile indexFile
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, size) values (1, ?, ?)")
    ( Sql.Binary content
    , ByteString.length content
    )
-- 525,004,544 bytes allocated in the heap
-- 914,128 bytes copied during GC
-- 348,234,328 bytes maximum residency (8 sample(s))
-- 7,058,592 bytes maximum slop
-- 332 MB total memory in use (0 MB lost due to fragmentation)
writeByteaWithDigestAndSizeLazy :: IO ()
writeByteaWithDigestAndSizeLazy = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- LazyByteString.readFile indexFile
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, digest, size) values (1, ?, ?, ?)")
    ( Sql.Binary content
    , Sql.Binary . byteString . ByteArray.convert . sha256 $ Crypto.hashlazy content
    , LazyByteString.length content
    )
-- 435,476,608 bytes allocated in the heap
-- 94,152 bytes copied during GC
-- 87,073,016 bytes maximum residency (4 sample(s))
-- 1,031,944 bytes maximum slop
-- 83 MB total memory in use (0 MB lost due to fragmentation)
writeByteaWithDigestAndSizeStrict :: IO ()
writeByteaWithDigestAndSizeStrict = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- ByteString.readFile indexFile
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, digest, size) values (1, ?, ?, ?)")
    ( Sql.Binary content
    , Sql.Binary . byteString . ByteArray.convert . sha256 $ Crypto.hash content
    , ByteString.length content
    )
-- 422,852,784 bytes allocated in the heap
-- 105,544 bytes copied during GC
-- 348,193,120 bytes maximum residency (4 sample(s))
-- 2,027,168 bytes maximum slop
-- 332 MB total memory in use (0 MB lost due to fragmentation)
readByteaLazy :: IO ()
readByteaLazy = withConnection $ \ connection -> do
  [row] <- Sql.query_ connection $ query "select content from byte_string where id = 1"
  row
    & Sql.fromOnly
    & Sql.fromBinary
    & LazyByteString.maximum
    & print
-- 415,335,352 bytes allocated in the heap
-- 99,440 bytes copied during GC
-- 348,193,056 bytes maximum residency (3 sample(s))
-- 2,027,232 bytes maximum slop
-- 332 MB total memory in use (0 MB lost due to fragmentation)
readByteaStrict :: IO ()
readByteaStrict = withConnection $ \ connection -> do
  [row] <- Sql.query_ connection $ query "select content from byte_string where id = 1"
  row
    & Sql.fromOnly
    & Sql.fromBinary
    & ByteString.maximum
    & print
-- 90,132,472 bytes allocated in the heap
-- 538,088 bytes copied during GC
-- 62,021,848 bytes maximum residency (6 sample(s))
-- 7,753,512 bytes maximum slop
-- 59 MB total memory in use (0 MB lost due to fragmentation)
writeLargeObjectLazy :: IO ()
writeLargeObjectLazy = withConnection $ \ connection -> do
  createLargeObjectTable connection
  content <- LazyByteString.readFile indexFile
  Sql.withTransaction connection $ do
    oid <- Sql.loCreat connection
    handle <- Sql.loOpen connection oid Sql.WriteMode
    mapM_ (Sql.loWrite connection handle) $ LazyByteString.toChunks content
    Sql.loClose connection handle
    Monad.void $ Sql.execute connection
      (query "insert into large_object (oid, digest, size) values (?, ?, ?)")
      ( oid
      , Sql.Binary . byteString . ByteArray.convert . sha256 $ Crypto.hashlazy content
      , LazyByteString.length content
      )
-- 87,348,088 bytes allocated in the heap
-- 73,856 bytes copied during GC
-- 87,104,728 bytes maximum residency (3 sample(s))
-- 1,033,000 bytes maximum slop
-- 83 MB total memory in use (0 MB lost due to fragmentation)
writeLargeObjectStrict :: IO ()
writeLargeObjectStrict = withConnection $ \ connection -> do
  createLargeObjectTable connection
  content <- ByteString.readFile indexFile
  Sql.withTransaction connection $ do
    oid <- Sql.loCreat connection
    handle <- Sql.loOpen connection oid Sql.WriteMode
    Monad.void $ Sql.loWrite connection handle content
    Sql.loClose connection handle
    Monad.void $ Sql.execute connection
      (query "insert into large_object (oid, digest, size) values (?, ?, ?)")
      ( oid
      , Sql.Binary . byteString . ByteArray.convert . sha256 $ Crypto.hash content
      , ByteString.length content
      )
-- 2,455,000 bytes allocated in the heap
-- 1,037,408 bytes copied during GC
-- 501,144 bytes maximum residency (2 sample(s))
-- 31,200 bytes maximum slop
-- 0 MB total memory in use (0 MB lost due to fragmentation)
readLargeObjectLazy :: IO ()
readLargeObjectLazy = withConnection $ \ connection -> Sql.withTransaction connection $ do
  [Sql.Only oid] <- Sql.query_ connection $ query "select oid from large_object limit 1"
  handle <- Sql.loOpen connection oid Sql.ReadMode
  ref <- IORef.newIORef mempty
  let
    size = 32 * 1024 :: Int
    loop = do
      chunk <- Sql.loRead connection handle size
      if ByteString.null chunk
        then do
          Sql.loClose connection handle
          builder <- IORef.readIORef ref
          builder
            & Builder.toLazyByteString
            & LazyByteString.maximum
            & print
        else do
          IORef.modifyIORef' ref (<> Builder.byteString chunk)
          loop
  loop
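-- Not one of the measured variants: a minimal sketch (unmeasured, and not
-- wired into main) of consuming a large object without accumulating it at
-- all, folding each chunk into a strict accumulator as it arrives. Assuming
-- nothing else retains the chunks, residency should stay near the 32 KiB
-- read size.
xorLargeObjectStream :: IO ()
xorLargeObjectStream = withConnection $ \ connection -> Sql.withTransaction connection $ do
  [Sql.Only oid] <- Sql.query_ connection $ query "select oid from large_object limit 1"
  handle <- Sql.loOpen connection oid Sql.ReadMode
  let
    loop acc = do
      chunk <- Sql.loRead connection handle (32 * 1024)
      if ByteString.null chunk
        then do
          Sql.loClose connection handle
          print acc
        -- strictly fold the chunk into the accumulator before looping
        else loop $! ByteString.foldl' Bits.xor acc chunk
  loop 0x00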
-- 161,856 bytes allocated in the heap
-- 33,392 bytes copied during GC
-- 61,504 bytes maximum residency (2 sample(s))
-- 32,704 bytes maximum slop
-- 0 MB total memory in use (0 MB lost due to fragmentation)
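-- Storing the size alongside the oid when the object is written is what lets
-- this variant fetch the whole thing with a single loRead call.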
readLargeObjectStrict :: IO ()
readLargeObjectStrict = withConnection $ \ connection -> Sql.withTransaction connection $ do
  [(oid, size)] <- Sql.query_ connection $ query "select oid, size from large_object limit 1"
  handle <- Sql.loOpen connection oid Sql.ReadMode
  content <- Sql.loRead connection handle size
  Sql.loClose connection handle
  content
    & ByteString.maximum
    & print