Created
February 28, 2020 15:49
-
-
Save tfausak/427a0e54115e90f3543902b543e39be8 to your computer and use it in GitHub Desktop.
Memory usage for various methods of storing Haskell byte strings in PostgreSQL.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- I am working on a tool that downloads the Hackage index, stores it, and | |
-- processes it. I am storing the index in PostgreSQL. I want to keep the | |
-- maximum residency down. I don't care too much about total runtime. | |
-- | |
-- This file contains a bunch of different operations in various formats. The | |
-- idea is to get a baseline measurement for each operation (like downloading | |
-- the index or reading it from a file) and each format (streaming, lazy, or | |
-- strict). Those measurements can be used to create a minimum memory | |
-- requirement for the actual tool. | |
-- | |
-- I was motivated to perform this experiment because naively storing byte | |
-- strings (either lazy or strict) in PostgreSQL bytea columns using the | |
-- postgresql-simple library used far more memory than I expected. In | |
-- retrospect the memory usage shouldn't have been surprising because the | |
-- entire byte string is base 16 encoded and sent over the wire as ASCII. | |
-- | |
-- The high level takeaway for me is that storing byte strings in PostgreSQL | |
-- should be done with large objects. Furthermore, strict byte strings seem to | |
-- optimize surprisingly well. | |
-- | |
-- At time of writing (2020-02-28), the hackage index was 87,043,229 bytes. All | |
-- measurements were taken with GHC 8.8.2 using `+RTS -s -RTS`. | |
module Main ( main ) where | |
import Data.Function ((&)) | |
import qualified Codec.Archive.Tar as Tar | |
import qualified Codec.Compression.GZip as Gzip | |
import qualified Control.Exception as Exception | |
import qualified Control.Monad as Monad | |
import qualified Crypto.Hash as Crypto | |
import qualified Data.Bits as Bits | |
import qualified Data.ByteArray as ByteArray | |
import qualified Data.ByteString as ByteString | |
import qualified Data.ByteString.Builder as Builder | |
import qualified Data.ByteString.Lazy as LazyByteString | |
import qualified Data.IORef as IORef | |
import qualified Data.String as String | |
import qualified Database.PostgreSQL.Simple as Sql | |
import qualified Database.PostgreSQL.Simple.LargeObjects as Sql | |
import qualified Network.HTTP.Client as Client | |
import qualified Network.HTTP.Client.TLS as Tls | |
import qualified System.Environment as Environment | |
import qualified System.IO as IO | |
-- | Entry point: run the benchmark named by the single command-line
-- argument.  Anything else aborts with a descriptive error.
main :: IO ()
main = do
  arguments <- Environment.getArgs
  case arguments of
    [name] | Just benchmark <- lookup name benchmarks -> benchmark
    _ -> fail $ "unexpected arguments: " <> show arguments
  where
    -- Lookup table from argument string to benchmark action.
    benchmarks :: [(String, IO ())]
    benchmarks =
      [ ("doNothing", doNothing)
      , ("downloadStream", downloadStream)
      , ("downloadLazy", downloadLazy)
      , ("downloadStrict", downloadStrict)
      , ("xorLazy", xorLazy)
      , ("xorStrict", xorStrict)
      , ("digestLazy", digestLazy)
      , ("digestStrict", digestStrict)
      , ("walkLazy", walkLazy)
      , ("walkStrict", walkStrict)
      , ("writeByteaLazy", writeByteaLazy)
      , ("writeByteaStrict", writeByteaStrict)
      , ("writeByteaWithDigestLazy", writeByteaWithDigestLazy)
      , ("writeByteaWithDigestStrict", writeByteaWithDigestStrict)
      , ("writeByteaWithSizeLazy", writeByteaWithSizeLazy)
      , ("writeByteaWithSizeStrict", writeByteaWithSizeStrict)
      , ("writeByteaWithDigestAndSizeLazy", writeByteaWithDigestAndSizeLazy)
      , ("writeByteaWithDigestAndSizeStrict", writeByteaWithDigestAndSizeStrict)
      , ("readByteaLazy", readByteaLazy)
      , ("readByteaStrict", readByteaStrict)
      , ("writeLargeObjectLazy", writeLargeObjectLazy)
      , ("writeLargeObjectStrict", writeLargeObjectStrict)
      , ("readLargeObjectLazy", readLargeObjectLazy)
      , ("readLargeObjectStrict", readLargeObjectStrict)
      , ("pruneLargeObjects", pruneLargeObjects)
      ]
-- | URL the Hackage package index is downloaded from.
indexUrl :: String
indexUrl = "https://hackage.haskell.org/01-index.tar.gz"
-- | Local path the downloaded index is written to and read back from.
indexFile :: FilePath
indexFile = "01-index.tar.gz"
-- | Run an action with a PostgreSQL connection.  The connection string
-- is empty, so connection parameters come from the environment; the
-- connection is closed even if the action throws.
withConnection :: (Sql.Connection -> IO a) -> IO a
withConnection action = Exception.bracket acquire Sql.close action
  where
    acquire = Sql.connectPostgreSQL ByteString.empty
-- | Ensure the @byte_string@ table exists, then empty it so every
-- benchmark starts from the same state.
createByteStringTable :: Sql.Connection -> IO ()
createByteStringTable connection =
  mapM_ run
    [ "create table if not exists byte_string (\
      \id integer primary key, \
      \content bytea, \
      \digest bytea, \
      \size integer)"
    , "truncate table byte_string"
    ]
  where
    -- execute_ returns an affected-row count; it is discarded here,
    -- as it was in the original.
    run sql = Monad.void . Sql.execute_ connection $ query sql
-- | Ensure the bookkeeping table for large objects exists.  Unlike
-- 'createByteStringTable' it is not truncated here; rows are removed by
-- 'pruneLargeObjects'.
createLargeObjectTable :: Sql.Connection -> IO ()
createLargeObjectTable connection = do
  _ <- Sql.execute_ connection $ query
    "create table if not exists large_object (\
    \oid oid primary key, \
    \digest bytea, \
    \size integer)"
  pure ()
-- | Delete every stored large object together with its bookkeeping row.
-- Each unlink/delete pair runs in its own transaction.
pruneLargeObjects :: IO ()
pruneLargeObjects = withConnection $ \ connection -> do
  oids <- Sql.query_ connection $ query "select oid from large_object"
  let
    prune row = Sql.withTransaction connection $ do
      Sql.loUnlink connection $ Sql.fromOnly row
      -- execute returns an affected-row count; discarded as before.
      Monad.void $ Sql.execute connection
        (query "delete from large_object where oid = ?") row
  mapM_ prune oids
-- | Build a postgresql-simple 'Sql.Query' from a literal SQL string.
query :: String -> Sql.Query
query sql = String.fromString sql
-- | Identity restricted to SHA-256 digests; exists only to pin down the
-- otherwise-ambiguous algorithm parameter of 'Crypto.hash' /
-- 'Crypto.hashlazy' at call sites.
sha256 :: Crypto.Digest Crypto.SHA256 -> Crypto.Digest Crypto.SHA256
sha256 digest = digest
-- | Identity restricted to strict byte strings; pins down the result
-- type of 'ByteArray.convert' at call sites.
byteString :: ByteString.ByteString -> ByteString.ByteString
byteString bytes = bytes
-- 66,552 bytes allocated in the heap | |
-- 15,800 bytes copied during GC | |
-- 48,640 bytes maximum residency (1 sample(s)) | |
-- 25,088 bytes maximum slop | |
-- 0 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Baseline benchmark: perform no work at all, to measure the RTS
-- floor that every other measurement includes.
doNothing :: IO ()
doNothing = return ()
-- 476,856,000 bytes allocated in the heap | |
-- 22,033,864 bytes copied during GC | |
-- 3,980,736 bytes maximum residency (6 sample(s)) | |
-- 82,496 bytes maximum slop | |
-- 3 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Download the index and write it to 'indexFile' one chunk at a time,
-- so at most one chunk of the body is resident at once.
--
-- Fix: the output handle was opened with 'IO.openBinaryFile' and never
-- closed, leaking the handle and risking unflushed buffered output.
-- 'IO.withBinaryFile' now closes (and flushes) it even on exception.
downloadStream :: IO ()
downloadStream = do
  manager <- Tls.newTlsManager
  request <- Client.parseUrlThrow indexUrl
  IO.withBinaryFile indexFile IO.WriteMode $ \ handle ->
    Client.withResponse request manager $ \ response ->
      let
        body = Client.responseBody response
        -- Pull chunks until brRead returns an empty chunk (end of body).
        loop = do
          chunk <- Client.brRead body
          Monad.unless (ByteString.null chunk) $ do
            ByteString.hPut handle chunk
            loop
      in loop
-- 460,711,120 bytes allocated in the heap | |
-- 38,340,264 bytes copied during GC | |
-- 57,126,280 bytes maximum residency (10 sample(s)) | |
-- 13,161,080 bytes maximum slop | |
-- 54 MB total memory in use (1 MB lost due to fragmentation) | |
-- | Download the index as a single lazy byte string ('Client.httpLbs')
-- and write it to 'indexFile'.
downloadLazy :: IO ()
downloadLazy = do
  manager <- Tls.newTlsManager
  request <- Client.parseUrlThrow indexUrl
  response <- Client.httpLbs request manager
  LazyByteString.writeFile indexFile $ Client.responseBody response
-- 490,346,976 bytes allocated in the heap | |
-- 39,302,176 bytes copied during GC | |
-- 58,373,848 bytes maximum residency (10 sample(s)) | |
-- 13,318,440 bytes maximum slop | |
-- 55 MB total memory in use (1 MB lost due to fragmentation) | |
-- | Download the index, flatten the lazy body into one strict byte
-- string, and write it to 'indexFile'.
downloadStrict :: IO ()
downloadStrict = do
  manager <- Tls.newTlsManager
  request <- Client.parseUrlThrow indexUrl
  response <- Client.httpLbs request manager
  let body = LazyByteString.toStrict $ Client.responseBody response
  ByteString.writeFile indexFile body
-- 2,918,612,168 bytes allocated in the heap | |
-- 794,016 bytes copied during GC | |
-- 66,153,704 bytes maximum residency (6 sample(s)) | |
-- 8,237,848 bytes maximum slop | |
-- 63 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Touch every byte of the index by xor-folding a lazily read file.
--
-- Fix: the fold's pure 'Data.Word.Word8' result was the final statement
-- of an 'IO' do-block, which does not type check; it is now forced with
-- 'Exception.evaluate' and discarded.
xorLazy :: IO ()
xorLazy = do
  content <- LazyByteString.readFile indexFile
  Monad.void . Exception.evaluate $
    LazyByteString.foldr Bits.xor 0x00 content
-- 87,147,704 bytes allocated in the heap | |
-- 34,048 bytes copied during GC | |
-- 70,208 bytes maximum residency (1 sample(s)) | |
-- 36,288 bytes maximum slop | |
-- 0 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Touch every byte of the index by strictly xor-folding a strictly
-- read file.
--
-- Fix: the fold's pure 'Data.Word.Word8' result was the final statement
-- of an 'IO' do-block, which does not type check; it is now forced with
-- 'Exception.evaluate' and discarded.
xorStrict :: IO ()
xorStrict = do
  content <- ByteString.readFile indexFile
  Monad.void . Exception.evaluate $
    ByteString.foldr' Bits.xor 0x00 content
-- 89,384,280 bytes allocated in the heap | |
-- 95,024 bytes copied during GC | |
-- 70,232 bytes maximum residency (2 sample(s)) | |
-- 32,168 bytes maximum slop | |
-- 0 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Hash the index with SHA-256 through the lazy 'Crypto.hashlazy'
-- interface.
--
-- Fix: the pure digest was the final statement of an 'IO' do-block,
-- which does not type check; it is now forced with 'Exception.evaluate'
-- and discarded.
digestLazy :: IO ()
digestLazy = do
  content <- LazyByteString.readFile indexFile
  Monad.void . Exception.evaluate . sha256 $ Crypto.hashlazy content
-- 87,167,104 bytes allocated in the heap | |
-- 50,904 bytes copied during GC | |
-- 87,097,624 bytes maximum residency (2 sample(s)) | |
-- 1,044,200 bytes maximum slop | |
-- 83 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Hash the index with SHA-256 through the strict 'Crypto.hash'
-- interface.
--
-- Fix: the pure digest was the final statement of an 'IO' do-block,
-- which does not type check; it is now forced with 'Exception.evaluate'
-- and discarded.
digestStrict :: IO ()
digestStrict = do
  content <- ByteString.readFile indexFile
  Monad.void . Exception.evaluate . sha256 $ Crypto.hash content
-- 927,177,008 bytes allocated in the heap | |
-- 481,552 bytes copied during GC | |
-- 132,736 bytes maximum residency (2 sample(s)) | |
-- 35,200 bytes maximum slop | |
-- 0 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Decompress the lazily read index, parse the tar archive, and count
-- its entries; tar format errors are rethrown as exceptions.
--
-- Fix: the pure entry count ('Int') was the final statement of an 'IO'
-- do-block, which does not type check; it is now forced with
-- 'Exception.evaluate' and discarded.
walkLazy :: IO ()
walkLazy = do
  content <- LazyByteString.readFile indexFile
  Monad.void . Exception.evaluate
    . length
    . Tar.foldEntries (:) [] Exception.throw
    . Tar.read
    $ Gzip.decompress content
-- 923,881,776 bytes allocated in the heap | |
-- 369,896 bytes copied during GC | |
-- 87,135,968 bytes maximum residency (2 sample(s)) | |
-- 1,046,816 bytes maximum slop | |
-- 83 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Like 'walkLazy', but read the index strictly first and convert to a
-- lazy byte string for decompression.
--
-- Fix: the pure entry count ('Int') was the final statement of an 'IO'
-- do-block, which does not type check; it is now forced with
-- 'Exception.evaluate' and discarded.
walkStrict :: IO ()
walkStrict = do
  content <- ByteString.readFile indexFile
  Monad.void . Exception.evaluate
    . length
    . Tar.foldEntries (:) [] Exception.throw
    . Tar.read
    . Gzip.decompress
    $ LazyByteString.fromStrict content
-- 524,083,312 bytes allocated in the heap | |
-- 679,928 bytes copied during GC | |
-- 348,234,184 bytes maximum residency (8 sample(s)) | |
-- 8,104,224 bytes maximum slop | |
-- 332 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Insert the whole index into a @bytea@ column from a lazy byte
-- string via postgresql-simple's 'Sql.Binary' wrapper.
writeByteaLazy :: IO ()
writeByteaLazy = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- LazyByteString.readFile indexFile
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content) values (1, ?)")
    (Sql.Only $ Sql.Binary content)
-- 435,433,360 bytes allocated in the heap | |
-- 73,536 bytes copied during GC | |
-- 174,115,840 bytes maximum residency (3 sample(s)) | |
-- 1,020,928 bytes maximum slop | |
-- 166 MB total memory in use (1 MB lost due to fragmentation) | |
-- | Insert the whole index into a @bytea@ column from a strict byte
-- string via postgresql-simple's 'Sql.Binary' wrapper.
writeByteaStrict :: IO ()
writeByteaStrict = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- ByteString.readFile indexFile
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content) values (1, ?)")
    (Sql.Only $ Sql.Binary content)
-- 524,959,288 bytes allocated in the heap | |
-- 913,136 bytes copied during GC | |
-- 348,234,296 bytes maximum residency (8 sample(s)) | |
-- 7,058,672 bytes maximum slop | |
-- 332 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Insert the index plus its SHA-256 digest (second @bytea@ column)
-- from a lazy byte string.
writeByteaWithDigestLazy :: IO ()
writeByteaWithDigestLazy = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- LazyByteString.readFile indexFile
  let
    digest = Sql.Binary . byteString . ByteArray.convert . sha256 $
      Crypto.hashlazy content
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, digest) values (1, ?, ?)")
    (Sql.Binary content, digest)
-- 435,464,808 bytes allocated in the heap | |
-- 94,104 bytes copied during GC | |
-- 87,072,960 bytes maximum residency (4 sample(s)) | |
-- 1,032,000 bytes maximum slop | |
-- 83 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Insert the index plus its SHA-256 digest (second @bytea@ column)
-- from a strict byte string.
writeByteaWithDigestStrict :: IO ()
writeByteaWithDigestStrict = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- ByteString.readFile indexFile
  let
    digest = Sql.Binary . byteString . ByteArray.convert . sha256 $
      Crypto.hash content
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, digest) values (1, ?, ?)")
    (Sql.Binary content, digest)
-- 524,150,008 bytes allocated in the heap | |
-- 913,040 bytes copied during GC | |
-- 348,234,160 bytes maximum residency (8 sample(s)) | |
-- 7,058,664 bytes maximum slop | |
-- 332 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Insert the index plus its byte count (integer column) from a lazy
-- byte string.
writeByteaWithSizeLazy :: IO ()
writeByteaWithSizeLazy = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- LazyByteString.readFile indexFile
  let size = LazyByteString.length content
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, size) values (1, ?, ?)")
    (Sql.Binary content, size)
-- 435,461,056 bytes allocated in the heap | |
-- 74,784 bytes copied during GC | |
-- 174,115,840 bytes maximum residency (3 sample(s)) | |
-- 1,020,928 bytes maximum slop | |
-- 166 MB total memory in use (1 MB lost due to fragmentation) | |
-- | Insert the index plus its byte count (integer column) from a strict
-- byte string.
writeByteaWithSizeStrict :: IO ()
writeByteaWithSizeStrict = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- ByteString.readFile indexFile
  let size = ByteString.length content
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, size) values (1, ?, ?)")
    (Sql.Binary content, size)
-- 525,004,544 bytes allocated in the heap | |
-- 914,128 bytes copied during GC | |
-- 348,234,328 bytes maximum residency (8 sample(s)) | |
-- 7,058,592 bytes maximum slop | |
-- 332 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Insert the index plus both its SHA-256 digest and its byte count
-- from a lazy byte string.
writeByteaWithDigestAndSizeLazy :: IO ()
writeByteaWithDigestAndSizeLazy = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- LazyByteString.readFile indexFile
  let
    digest = Sql.Binary . byteString . ByteArray.convert . sha256 $
      Crypto.hashlazy content
    size = LazyByteString.length content
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, digest, size) values (1, ?, ?, ?)")
    (Sql.Binary content, digest, size)
-- 435,476,608 bytes allocated in the heap | |
-- 94,152 bytes copied during GC | |
-- 87,073,016 bytes maximum residency (4 sample(s)) | |
-- 1,031,944 bytes maximum slop | |
-- 83 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Insert the index plus both its SHA-256 digest and its byte count
-- from a strict byte string.
writeByteaWithDigestAndSizeStrict :: IO ()
writeByteaWithDigestAndSizeStrict = withConnection $ \ connection -> do
  createByteStringTable connection
  content <- ByteString.readFile indexFile
  let
    digest = Sql.Binary . byteString . ByteArray.convert . sha256 $
      Crypto.hash content
    size = ByteString.length content
  Monad.void $ Sql.execute connection
    (query "insert into byte_string (id, content, digest, size) values (1, ?, ?, ?)")
    (Sql.Binary content, digest, size)
-- 422,852,784 bytes allocated in the heap | |
-- 105,544 bytes copied during GC | |
-- 348,193,120 bytes maximum residency (4 sample(s)) | |
-- 2,027,168 bytes maximum slop | |
-- 332 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Read the index back out of the @bytea@ column as a lazy byte string
-- and walk it with 'LazyByteString.maximum' to touch every chunk.
--
-- Fix: the pure 'Data.Word.Word8' maximum was the final statement of an
-- 'IO' do-block, which does not type check; it is now forced with
-- 'Exception.evaluate' and discarded.
-- NOTE(review): the @[row] <-@ pattern and 'LazyByteString.maximum' are
-- partial; they assume a write benchmark populated row id 1 first.
readByteaLazy :: IO ()
readByteaLazy = withConnection $ \ connection -> do
  [row] <- Sql.query_ connection $ query "select content from byte_string where id = 1"
  Monad.void . Exception.evaluate
    . LazyByteString.maximum
    . Sql.fromBinary
    $ Sql.fromOnly row
-- 415,335,352 bytes allocated in the heap | |
-- 99,440 bytes copied during GC | |
-- 348,193,056 bytes maximum residency (3 sample(s)) | |
-- 2,027,232 bytes maximum slop | |
-- 332 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Read the index back out of the @bytea@ column as a strict byte
-- string and walk it with 'ByteString.maximum'.
--
-- Fix: the pure 'Data.Word.Word8' maximum was the final statement of an
-- 'IO' do-block, which does not type check; it is now forced with
-- 'Exception.evaluate' and discarded.
-- NOTE(review): the @[row] <-@ pattern and 'ByteString.maximum' are
-- partial; they assume a write benchmark populated row id 1 first.
readByteaStrict :: IO ()
readByteaStrict = withConnection $ \ connection -> do
  [row] <- Sql.query_ connection $ query "select content from byte_string where id = 1"
  Monad.void . Exception.evaluate
    . ByteString.maximum
    . Sql.fromBinary
    $ Sql.fromOnly row
-- 90,132,472 bytes allocated in the heap | |
-- 538,088 bytes copied during GC | |
-- 62,021,848 bytes maximum residency (6 sample(s)) | |
-- 7,753,512 bytes maximum slop | |
-- 59 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Store the index as a PostgreSQL large object, streaming the lazy
-- byte string's chunks one at a time, then record its oid, SHA-256
-- digest, and size in the bookkeeping table.  All large-object calls
-- run inside a single transaction.
writeLargeObjectLazy :: IO ()
writeLargeObjectLazy = withConnection $ \ connection -> do
  createLargeObjectTable connection
  content <- LazyByteString.readFile indexFile
  Sql.withTransaction connection $ do
    oid <- Sql.loCreat connection
    lo <- Sql.loOpen connection oid Sql.WriteMode
    -- loWrite returns a byte count per chunk; forM_ discards it, as
    -- mapM_ did originally.
    Monad.forM_ (LazyByteString.toChunks content) $ Sql.loWrite connection lo
    Sql.loClose connection lo
    let
      digest = Sql.Binary . byteString . ByteArray.convert . sha256 $
        Crypto.hashlazy content
      size = LazyByteString.length content
    Monad.void $ Sql.execute connection
      (query "insert into large_object (oid, digest, size) values (?, ?, ?)")
      (oid, digest, size)
-- 87,348,088 bytes allocated in the heap | |
-- 73,856 bytes copied during GC | |
-- 87,104,728 bytes maximum residency (3 sample(s)) | |
-- 1,033,000 bytes maximum slop | |
-- 83 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Store the index as a PostgreSQL large object with one 'Sql.loWrite'
-- of the whole strict byte string, then record its oid, SHA-256 digest,
-- and size in the bookkeeping table.  All large-object calls run inside
-- a single transaction.
writeLargeObjectStrict :: IO ()
writeLargeObjectStrict = withConnection $ \ connection -> do
  createLargeObjectTable connection
  content <- ByteString.readFile indexFile
  Sql.withTransaction connection $ do
    oid <- Sql.loCreat connection
    lo <- Sql.loOpen connection oid Sql.WriteMode
    -- loWrite returns the number of bytes written; discarded as before.
    Monad.void $ Sql.loWrite connection lo content
    Sql.loClose connection lo
    let
      digest = Sql.Binary . byteString . ByteArray.convert . sha256 $
        Crypto.hash content
      size = ByteString.length content
    Monad.void $ Sql.execute connection
      (query "insert into large_object (oid, digest, size) values (?, ?, ?)")
      (oid, digest, size)
-- 2,455,000 bytes allocated in the heap | |
-- 1,037,408 bytes copied during GC | |
-- 501,144 bytes maximum residency (2 sample(s)) | |
-- 31,200 bytes maximum slop | |
-- 0 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Read a large object back in 32 KiB chunks, accumulating a
-- 'Builder.Builder' in an 'IORef', then walk the assembled lazy byte
-- string with 'LazyByteString.maximum' to touch every byte.
--
-- Fix: the pure 'Data.Word.Word8' maximum was the final statement of an
-- 'IO' do-block, which does not type check; it is now forced with
-- 'Exception.evaluate' and discarded.
-- NOTE(review): the @[Sql.Only oid] <-@ pattern is partial; it assumes
-- a write benchmark stored exactly one large object first.
readLargeObjectLazy :: IO ()
readLargeObjectLazy = withConnection $ \ connection -> Sql.withTransaction connection $ do
  [Sql.Only oid] <- Sql.query_ connection $ query "select oid from large_object limit 1"
  handle <- Sql.loOpen connection oid Sql.ReadMode
  ref <- IORef.newIORef mempty
  let
    size = 32 * 1024 :: Int
    loop = do
      chunk <- Sql.loRead connection handle size
      if ByteString.null chunk
        then do
          -- An empty chunk marks end of object: close it, then force
          -- the accumulated bytes.
          Sql.loClose connection handle
          builder <- IORef.readIORef ref
          Monad.void . Exception.evaluate
            . LazyByteString.maximum
            $ Builder.toLazyByteString builder
        else do
          -- Strict modify so the IORef holds a builder, not a thunk chain.
          IORef.modifyIORef' ref (<> Builder.byteString chunk)
          loop
  loop
-- 161,856 bytes allocated in the heap | |
-- 33,392 bytes copied during GC | |
-- 61,504 bytes maximum residency (2 sample(s)) | |
-- 32,704 bytes maximum slop | |
-- 0 MB total memory in use (0 MB lost due to fragmentation) | |
-- | Read a large object back with a single 'Sql.loRead' of the size
-- recorded in the bookkeeping table, and walk the strict byte string
-- with 'ByteString.maximum'.
--
-- Fix: the pure 'Data.Word.Word8' maximum was the final statement of an
-- 'IO' do-block, which does not type check; it is now forced with
-- 'Exception.evaluate' and discarded.
-- NOTE(review): the @[(oid, size)] <-@ pattern and 'ByteString.maximum'
-- are partial; they assume one stored, non-empty large object.
readLargeObjectStrict :: IO ()
readLargeObjectStrict = withConnection $ \ connection -> Sql.withTransaction connection $ do
  [(oid, size)] <- Sql.query_ connection $ query "select oid, size from large_object limit 1"
  handle <- Sql.loOpen connection oid Sql.ReadMode
  content <- Sql.loRead connection handle size
  Sql.loClose connection handle
  Monad.void . Exception.evaluate $ ByteString.maximum content
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment