Created
September 20, 2012 14:17
-
-
Save snoyberg/3756212 to your computer and use it in GitHub Desktop.
Lower-casing a ByteString
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings, MagicHash, UnboxedTuples, BangPatterns #-} | |
import Test.Hspec (hspec) | |
import Test.Hspec.QuickCheck (prop) | |
import Criterion.Main (defaultMain, bench, whnf) | |
import qualified Data.ByteString as S | |
import qualified Data.ByteString.Char8 as S8 | |
import Data.ByteString.Unsafe (unsafePackAddressLen, unsafeUseAsCStringLen) | |
import qualified Data.Char | |
import qualified Data.Word8 | |
import System.IO.Unsafe (unsafePerformIO) | |
import Foreign.Storable (peekByteOff) | |
import Foreign.Ptr (Ptr) | |
import Foreign.C.String (CStringLen) | |
import GHC.Prim | |
( leWord#, Word#, plusWord#, State#, RealWorld, Addr# | |
, writeWord8Array#, (==#), byteArrayContents#, unsafeFreezeByteArray# | |
, newPinnedByteArray# | |
) | |
import GHC.Base (Int (..)) | |
import GHC.Word (Word (..), Word8 (..)) | |
import GHC.Types (IO (..)) | |
main :: IO () | |
main = do | |
hspec $ do | |
prop "char8 == word8" $ \s -> | |
let bs = S.pack s | |
in S8.map Data.Char.toLower bs == S.map Data.Word8.toLower bs | |
prop "char8 == bsToLower" $ \s -> | |
let bs = S.pack s | |
in S8.map Data.Char.toLower bs == bsToLower bs | |
prop "char8 == map toLowerC" $ \s -> | |
let bs = S.pack s | |
in S8.map Data.Char.toLower bs == S8.map toLowerC bs | |
input <- S.readFile "bench.hs" | |
defaultMain | |
[ bench "Char8" $ whnf (S.length . S8.map Data.Char.toLower) input | |
, bench "Char8 toLowerC" $ whnf (S.length . S8.map toLowerC) input | |
, bench "Word8" $ whnf (S.length . S.map Data.Word8.toLower) input | |
, bench "bsToLower" $ whnf (S.length . bsToLower) input | |
] | |
bsToLower :: S.ByteString -> S.ByteString | |
bsToLower bs | |
| S.null bs = S.empty | |
| otherwise = unsafePerformIO $ unsafeUseAsCStringLen bs cstrlenToLower | |
cstrlenToLower :: CStringLen -> IO S.ByteString | |
cstrlenToLower (input, !len'@(I# len)) = IO $ \state0 -> | |
case newPinnedByteArray# len state0 of | |
(# state1, mba #) -> | |
case loop mba 0 state1 of | |
state2 -> | |
case unsafeFreezeByteArray# mba state2 of | |
(# state3, ba #) -> | |
unsafePackAddressLen' len' (byteArrayContents# ba) state3 | |
where | |
loop mba offset@(I# offset') stateA | |
| offset' ==# len = stateA | |
| otherwise = | |
case peek' input offset stateA of | |
(# stateB, W8# w #) -> | |
case writeWord8Array# mba offset' (toLowerU w) stateB of | |
stateC -> loop mba (offset + 1) stateC | |
unsafePackAddressLen' :: Int | |
-> Addr# | |
-> State# RealWorld | |
-> (# State# RealWorld, S.ByteString #) | |
unsafePackAddressLen' i a = | |
f | |
where | |
IO f = unsafePackAddressLen i a | |
peek' :: Ptr b -> Int -> State# RealWorld -> (# State# RealWorld, Word8 #) | |
peek' ptr offset = | |
f | |
where | |
IO f = peekByteOff ptr offset | |
toLowerU :: Word# -> Word# | |
toLowerU w | |
| isUpperU w = w `plusWord#` space | |
| otherwise = w | |
where | |
!(W8# space) = 0x20 | |
isUpperU :: Word# -> Bool | |
isUpperU w = | |
(l1 `leWord#` w && w `leWord#` h1) || | |
(l2 `leWord#` w && w `leWord#` h2) || | |
(l3 `leWord#` w && w `leWord#` h3) | |
where | |
!(W# l1) = 0x41 | |
!(W# h1) = 0x5a | |
!(W# l2) = 0xc0 | |
!(W# h2) = 0xd6 | |
!(W# l3) = 0xd8 | |
!(W# h3) = 0xde | |
toLowerC :: Char -> Char | |
toLowerC w | |
| isUpperC w = toEnum $ fromEnum w + 0x20 | |
| otherwise = w | |
isUpperC :: Char -> Bool | |
isUpperC w = | |
(l1 <= w && w <= h1) || | |
(l2 <= w && w <= h2) || | |
(l3 <= w && w <= h3) | |
where | |
l1 = '\x41' | |
h1 = '\x5a' | |
l2 = '\xc0' | |
h2 = '\xd6' | |
l3 = '\xd8' | |
h3 = '\xde' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment