Last active
January 30, 2025 21:29
-
-
Save dino-/28b09c465c756c44b2c91d777408e166 to your computer and use it in GitHub Desktop.
A handy illustration of converting between String, Text and ByteString in Haskell
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env stack | |
-- stack --resolver lts-18.8 script | |
{-# LANGUAGE OverloadedStrings #-} | |
{-
  This is a handy illustration of converting between five of the commonly-used
  string types in Haskell (String, ByteString, lazy ByteString, Text and lazy
  Text).

  Some things to note:

  - We are converting between String and ByteString through the Text modules,
    which handle Unicode properly. It's a common (but wrong) practice to use
    Data.ByteString.Char8 for these conversions instead. Don't do that!
  - On that note, it's possible to use Data.ByteString.UTF8 from the
    utf8-string package for these conversions, using UTF8.toString and
    UTF8.fromString instead of Text's (encodeUtf8 . pack) and
    (unpack . decodeUtf8), but that requires the additional library.
  - It's possible you need something other than UTF-8. There are more
    decode/encode options (like decodeUtf32LE and friends) in
    Data.Text.Encoding.
-}
import Data.ByteString as B | |
import Data.ByteString.Lazy as BL | |
import Data.Text as T | |
import Data.Text.Encoding as T | |
import Data.Text.IO as T | |
import Data.Text.Lazy as TL | |
import Data.Text.Lazy.Encoding as TL | |
import Data.Text.Lazy.IO as TL | |
import Prelude as P | |
-- | Print one line for every conversion between String, strict ByteString,
-- lazy ByteString, strict Text and lazy Text.
--
-- Each section starts from a literal of the source type (the ByteString and
-- Text literals rely on OverloadedStrings) and prints the converted value
-- with the putStrLn belonging to the target type, so the type checker proves
-- that every conversion really produces the advertised type.
--
-- Note that T.decodeUtf8 and TL.decodeUtf8 throw on input that is not valid
-- UTF-8; that is fine for these literals, but prefer decodeUtf8' when the
-- bytes come from outside the program.
main :: IO ()
main = do
  P.putStrLn "from String"
  B.putStrLn $ T.encodeUtf8 . T.pack $ "String to strict ByteString"
  BL.putStrLn $ TL.encodeUtf8 . TL.pack $ "String to lazy ByteString"
  T.putStrLn $ T.pack "String to strict Text"
  TL.putStrLn $ TL.pack "String to lazy Text"

  P.putStrLn "\nfrom strict ByteString"
  P.putStrLn $ T.unpack . T.decodeUtf8 $ "strict ByteString to String"
  -- BL.fromStrict is the library's own strict-to-lazy conversion; it replaces
  -- the hand-rolled (BL.fromChunks . return).
  BL.putStrLn $ BL.fromStrict "strict ByteString to lazy ByteString"
  T.putStrLn $ T.decodeUtf8 "strict ByteString to strict Text"
  TL.putStrLn $ TL.fromStrict . T.decodeUtf8 $ "strict ByteString to lazy Text"

  P.putStrLn "\nfrom lazy ByteString"
  P.putStrLn $ TL.unpack . TL.decodeUtf8 $ "lazy ByteString to String"
  -- BL.toStrict is the library's own lazy-to-strict conversion; it replaces
  -- the hand-rolled (B.concat . BL.toChunks).
  B.putStrLn $ BL.toStrict "lazy ByteString to strict ByteString"
  T.putStrLn $ T.decodeUtf8 . BL.toStrict $ "lazy ByteString to strict Text"
  TL.putStrLn $ TL.decodeUtf8 "lazy ByteString to lazy Text"

  P.putStrLn "\nfrom strict Text"
  P.putStrLn $ T.unpack "strict Text to String"
  B.putStrLn $ T.encodeUtf8 "strict Text to strict ByteString"
  BL.putStrLn $ BL.fromStrict . T.encodeUtf8 $ "strict Text to lazy ByteString"
  TL.putStrLn $ TL.fromStrict "strict Text to lazy Text"

  P.putStrLn "\nfrom lazy Text"
  P.putStrLn $ TL.unpack "lazy Text to String"
  B.putStrLn $ T.encodeUtf8 . TL.toStrict $ "lazy Text to strict ByteString"
  BL.putStrLn $ TL.encodeUtf8 "lazy Text to lazy ByteString"
  T.putStrLn $ TL.toStrict "lazy Text to strict Text"
`BL.toStrict` and `BS.concat . BL.toChunks` (where `BS` is the strict `Data.ByteString` module, imported as `B` in the script above) are implemented very differently, with `toStrict` doing its own memory allocation and copying. I'm not clear on how these differ in terms of efficiency or memory usage, but there is a warning in the API doc about using `toStrict` specifically (included below in the `toStrict` source).
All that said, I've most often seen the `BS.concat . BL.toChunks` method in examples, but I am not sure when to choose one over the other.
For reference, here's the source code of the relevant functions, with the qualified package aliases changed to "BS" and "BL" for clarity.
These functions make up `BS.concat . BL.toChunks`; `BL.foldrChunks` is used by `BL.toChunks`.
-- defined in Data.ByteString.Lazy.Internal
-- Right fold over the strict chunks of a lazy ByteString: an Empty value
-- yields z, and a Chunk c cs combines the chunk c with the fold of the
-- remaining chunks cs using f.
foldrChunks :: (BS.ByteString -> a -> a) -> a -> BL.ByteString -> a
foldrChunks f z = go
where go Empty = z
go (Chunk c cs) = f c (go cs)
-- defined in Data.ByteString.Lazy
-- Lists the strict chunks of a lazy ByteString, in order, by folding the
-- chunks with (:) and [] through foldrChunks.
toChunks :: BL.ByteString -> [BS.ByteString]
toChunks = foldrChunks (:) []
-- defined in Data.ByteString.Internal
-- NOTE(review): the signature below is for strict ByteStrings, but the body
-- pattern-matches on Empty and Chunk, which are the lazy ByteString
-- constructors (the same ones foldrChunks matches on). This looks like the
-- body of the lazy Data.ByteString.Lazy.concat rather than the strict concat
-- named here -- confirm against the bytestring source before drawing
-- conclusions from it.
concat :: [BS.ByteString] -> BS.ByteString
concat = to
where
-- go walks the chunks of the current element and, once it reaches Empty,
-- continues with the remaining list elements via to.
go Empty css = to css
go (Chunk c cs) css = Chunk c (go cs css)
-- to handles the outer list: an empty list gives Empty, otherwise the head
-- element is spliced in front of the rest.
to [] = Empty
to (cs:css) = go cs css
In contrast, `toStrict` is implemented with this code:
-- defined in Data.ByteString.Lazy.Internal
-- |/O(n)/ Convert a lazy 'ByteString' into a strict 'ByteString'.
--
-- Note that this is an /expensive/ operation that forces the whole lazy
-- ByteString into memory and then copies all the data. If possible, try to
-- avoid converting back and forth between strict and lazy bytestrings.
--
toStrict :: BL.ByteString -> BS.ByteString
-- The input is handed to goLen0 twice: the second copy is consumed while
-- measuring, the first (cs0) is kept untouched for the later copy pass.
toStrict = \cs -> goLen0 cs cs
-- We pass the original [ByteString] (bss0) through as an argument through
-- goLen0, goLen1, and goLen since we will need it again in goCopy. Passing
-- it as an explicit argument avoids capturing it in these functions'
-- closures which would result in unnecessary closure allocation.
where
-- It's still possible that the result is empty
goLen0 _ Empty = BS.BS BS.nullForeignPtr 0
-- A chunk whose length field is 0 contributes nothing, so skip it.
goLen0 cs0 (Chunk (BS.BS _ 0) cs) = goLen0 cs0 cs
-- First non-empty chunk found: remember it and keep scanning.
goLen0 cs0 (Chunk c cs) = goLen1 cs0 c cs
-- It's still possible that the result is a single chunk
-- Only one non-empty chunk exists: return that chunk itself.
goLen1 _ bs Empty = bs
goLen1 cs0 bs (Chunk (BS.BS _ 0) cs) = goLen1 cs0 bs cs
-- A second non-empty chunk exists: start summing lengths.
-- NOTE(review): checkedAdd presumably guards the sum against overflow --
-- confirm against Data.ByteString.Internal.
goLen1 cs0 (BS.BS _ bl) (Chunk (BS.BS _ cl) cs) =
goLen cs0 (BS.checkedAdd "Lazy.concat" bl cl) cs
-- General case, just find the total length we'll need
goLen cs0 !total (Chunk (BS.BS _ cl) cs) =
goLen cs0 (BS.checkedAdd "Lazy.concat" total cl) cs
-- All chunks measured: build the result of size total and fill it by
-- running goCopy over the original chunks (cs0).
-- NOTE(review): unsafeCreate presumably allocates the buffer -- confirm.
goLen cs0 total Empty =
BS.unsafeCreate total $ \ptr -> goCopy cs0 ptr
-- Copy the data
goCopy Empty !_ = return ()
-- Zero-length chunks are skipped without issuing a copy.
goCopy (Chunk (BS.BS _ 0 ) cs) !ptr = goCopy cs ptr
goCopy (Chunk (BS.BS fp len) cs) !ptr =
BS.unsafeWithForeignPtr fp $ \p -> do
BS.memcpy ptr p len
-- Advance the destination pointer past the bytes just written.
goCopy cs (ptr `plusPtr` len)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
at line 53, there is a toStrict function to convert between lazy and strict Bytestrings:
https://hackage.haskell.org/package/bytestring-0.10.2.0/docs/Data-ByteString-Lazy.html#v:toStrict