Created
November 10, 2018 01:31
-
-
Save sjakobi/f0e6842fb6644e3a37dc41f869f6c158 to your computer and use it in GitHub Desktop.
SplitUtf8StringSpec.hs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# language BangPatterns, MagicHash, UnboxedTuples, MultiWayIf #-} | |
module Main (main, spec) where | |
import Test.Hspec | |
import Test.QuickCheck | |
import Data.ByteString | |
import qualified Data.ByteString as BS | |
import qualified Data.ByteString.Internal as BS | |
import Foreign | |
import GHC.Exts | |
import Encoding | |
import FastFunctions | |
import GHC.ForeignPtr | |
main :: IO () | |
main = hspec spec | |
spec :: Spec | |
spec = do | |
describe "utf8SplitAtByteString" $ do | |
it "concatenation" $ property $ | |
\(Utf8String bs) n -> let (a, b) = utf8SplitAtByteString n bs in a <> b === bs | |
newtype Utf8String = Utf8String ByteString | |
deriving Show | |
instance Arbitrary Utf8String where | |
arbitrary = do | |
Utf8String bs <- mkUtf8String . getUnicodeString <$> arbitrary | |
n <- arbitrary | |
pure (Utf8String (snd (utf8SplitAtByteString n bs))) | |
mkUtf8String :: String -> Utf8String | |
mkUtf8String s = | |
inlinePerformIO $ do | |
let len = utf8EncodedLength s | |
buf <- mallocForeignPtrBytes len | |
withForeignPtr buf $ \ptr -> do | |
utf8EncodeString ptr s | |
pure (Utf8String (BS.fromForeignPtr buf 0 len)) | |
-- | Split after a given number of characters. | |
-- Negative values are treated as if they are 0. | |
utf8SplitAtByteString :: Int -> ByteString -> (ByteString, ByteString) | |
utf8SplitAtByteString n0 bs@(BS.PS fptr off0 len) | |
| n0 <= 0 = (BS.empty, bs) | |
| otherwise = | |
case go n0 start of | |
ptr | ptr >= end -> (bs, BS.empty) | |
ptr -> | |
let d = ptr `minusPtr` start | |
in (BS.PS fptr off0 d, BS.PS fptr (off0 + d) (len - d)) | |
where | |
!start = unsafeForeignPtrToPtr fptr `plusPtr` off0 | |
!end = start `plusPtr` len | |
go n ptr | |
| n > 0 && ptr < end = go (pred n) (ptr `plusPtr` utf8CharSize ptr) | |
| otherwise = ptr | |
-- | Returns the size of UTF8-encoded character at the given 'Addr#'. | |
-- | |
-- The validity of the encoding is not checked. | |
{-# INLINE utf8CharSize# #-} | |
utf8CharSize# :: Addr# -> Int# | |
utf8CharSize# a# = | |
let !ch0 = indexWord8OffAddr# a# 0# in | |
if | isTrue# (ch0 `leWord#` 0x7F##) -> 1# | |
| isTrue# (ch0 `leWord#` 0xDF##) -> 2# | |
| isTrue# (ch0 `leWord#` 0xEF##) -> 3# | |
| otherwise -> 4# | |
-- | Returns the size of UTF8-encoded character beginning at the given | |
-- @'Ptr' 'Word8'@. | |
-- | |
-- The validity of the encoding is not checked. | |
utf8CharSize :: Ptr Word8 -> Int | |
utf8CharSize (Ptr a#) = I# (utf8CharSize# a#) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment