-
-
Save gregorycollins/3756876 to your computer and use it in GitHub Desktop.
Lower-casing a ByteString
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings, MagicHash, UnboxedTuples, BangPatterns #-} | |
import Control.Monad (replicateM_) | |
import Criterion.Main (defaultMain, bench, whnfIO) | |
import qualified Data.ByteString as S | |
import qualified Data.ByteString.Char8 as S8 | |
import Data.ByteString.Unsafe (unsafePackAddressLen, unsafeUseAsCStringLen) | |
import qualified Data.Char | |
import Data.Char (ord) | |
import qualified Data.Word8 | |
import System.IO.Unsafe (unsafePerformIO) | |
import Foreign.Storable (peekByteOff) | |
import Foreign.Ptr (Ptr) | |
import Foreign.C.String (CStringLen) | |
import GHC.Prim | |
( leWord#, Word#, plusWord#, State#, RealWorld, Addr# | |
, writeWord8Array#, (==#), byteArrayContents#, unsafeFreezeByteArray# | |
, newPinnedByteArray# | |
) | |
import GHC.Base (Int (..), unsafeChr) | |
import GHC.Word (Word (..), Word8 (..)) | |
import GHC.Types (IO (..)) | |
main :: IO () | |
main = do | |
input <- S.readFile "bench.hs" | |
defaultMain | |
[ bench "Word8" $ b (S.length . S.map Data.Word8.toLower) input | |
, bench "Word8-local" $ b (S.length . S.map toLower8) input | |
, bench "Char8 toLowerC" $ b (S.length . S8.map toLowerC) input | |
] | |
where | |
b f i = whnfIO $ replicateM_ 1000 $ do | |
!_ <- return $! f i | |
return $! () | |
toLower8 :: Word8 -> Word8 | |
toLower8 w | |
| isUpper8 w = w + 32 | |
| otherwise = w | |
{-# INLINE toLower8 #-} | |
isUpper8 :: Word8 -> Bool | |
isUpper8 w = 0x41 <= w && w <= 0x5a | |
|| 0xc0 <= w && w <= 0xd6 | |
|| 0xd8 <= w && w <= 0xde | |
{-# INLINE isUpper8 #-} | |
bsToLower :: S.ByteString -> S.ByteString | |
bsToLower bs | |
| S.null bs = S.empty | |
| otherwise = unsafePerformIO $ unsafeUseAsCStringLen bs cstrlenToLower | |
cstrlenToLower :: CStringLen -> IO S.ByteString | |
cstrlenToLower (input, !len'@(I# len)) = IO $ \state0 -> | |
case newPinnedByteArray# len state0 of | |
(# state1, mba #) -> | |
case loop mba 0 state1 of | |
state2 -> | |
case unsafeFreezeByteArray# mba state2 of | |
(# state3, ba #) -> | |
unsafePackAddressLen' len' (byteArrayContents# ba) state3 | |
where | |
loop mba offset@(I# offset') stateA | |
| offset' ==# len = stateA | |
| otherwise = | |
case peek' input offset stateA of | |
(# stateB, W8# w #) -> | |
case writeWord8Array# mba offset' (toLowerU w) stateB of | |
stateC -> loop mba (offset + 1) stateC | |
unsafePackAddressLen' :: Int | |
-> Addr# | |
-> State# RealWorld | |
-> (# State# RealWorld, S.ByteString #) | |
unsafePackAddressLen' i a = | |
f | |
where | |
IO f = unsafePackAddressLen i a | |
peek' :: Ptr b -> Int -> State# RealWorld -> (# State# RealWorld, Word8 #) | |
peek' ptr offset = | |
f | |
where | |
IO f = peekByteOff ptr offset | |
toLowerU :: Word# -> Word# | |
toLowerU w | |
| isUpperU w = w `plusWord#` space | |
| otherwise = w | |
where | |
!(W8# space) = 0x20 | |
isUpperU :: Word# -> Bool | |
isUpperU w = | |
(l1 `leWord#` w && w `leWord#` h1) || | |
(l2 `leWord#` w && w `leWord#` h2) || | |
(l3 `leWord#` w && w `leWord#` h3) | |
where | |
!(W# l1) = 0x41 | |
!(W# h1) = 0x5a | |
!(W# l2) = 0xc0 | |
!(W# h2) = 0xd6 | |
!(W# l3) = 0xd8 | |
!(W# h3) = 0xde | |
toLowerC :: Char -> Char | |
toLowerC w | |
| isUpperC w = unsafeChr $ ord w + 0x20 | |
| otherwise = w | |
isUpperC :: Char -> Bool | |
isUpperC w = | |
(l1 <= w && w <= h1) || | |
(l2 <= w && w <= h2) || | |
(l3 <= w && w <= h3) | |
where | |
l1 = '\x41' | |
h1 = '\x5a' | |
l2 = '\xc0' | |
h2 = '\xd6' | |
l3 = '\xd8' | |
h3 = '\xde' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment