Skip to content

Instantly share code, notes, and snippets.

@bacher09
Last active August 29, 2015 14:16
Show Gist options
  • Save bacher09/534da754e3ef54350cbb to your computer and use it in GitHub Desktop.
Save bacher09/534da754e3ef54350cbb to your computer and use it in GitHub Desktop.
module Utf8 where
import qualified Data.ByteString as B
import Control.Applicative
import Data.Word
import Data.Bits
import Data.Char
import Debug.Trace
utf8SeqInfo :: Word8 -> Maybe (Word8, Word8)
utf8SeqInfo c
| c < oneb = Just (0, c)
| c < twob = Nothing
| c < threb = Just (1, dropBytes threb)
| c < fourb = Just (2, dropBytes fourb)
| c < fiveb = Just (3, dropBytes fiveb)
| c < sixb = Just (4, dropBytes sixb)
| c < sevb = Just (5, dropBytes sevb)
| otherwise = Nothing
where
oneb = 0x80 -- 1000 0000
twob = 0xC0 -- 1100 0000
threb = 0xE0 -- 1110 0000
fourb = 0xF0 -- 1111 0000
fiveb = 0xF8 -- 1111 1000
sixb = 0xFC -- 1111 1100
sevb = 0xFE -- 1111 1110
dropBytes b = c .&. complement b
utf8ExtInfo :: Word8 -> Maybe Word8
utf8ExtInfo c = if c >= 0x80 && c < 0xC0
then Just $ c .&. 0x3F
else Nothing
maybeIndex :: B.ByteString -> Int -> Maybe Word8
maybeIndex s i = if B.length s > i
then Just $ s `B.index` i
else Nothing
maybeChar :: Int -> Maybe Char
maybeChar code = if code <= 0x10FFFF -- maximal char
then Just $ chr code
else Nothing
decodeUtf8Char :: B.ByteString -> Int -> Word8 -> Maybe (Char, Int)
decodeUtf8Char bs i b = do
(read_count, part) <- utf8SeqInfo b
(char_code, from) <- readUtf8Seq (fromIntegral part) read_count i
sym <- maybeChar char_code
return (sym, from)
where
readUtf8Seq :: Int -> Word8 -> Int -> Maybe (Int, Int)
readUtf8Seq p 0 from = Just (p, from)
readUtf8Seq p n from = do
next_part <- bs `maybeIndex` from >>= utf8ExtInfo
let p' = p `shiftL` 6 .|. fromIntegral next_part
readUtf8Seq p' (n - 1) (from + 1)
tryDecodeUtf8Char :: B.ByteString -> Int -> Maybe (Either (Word8, Int) (Char, Int))
tryDecodeUtf8Char bs i = do
f <- bs `maybeIndex` i
case decodeUtf8Char bs (i + 1) f of
(Just v) -> return $ Right v
Nothing -> return $ Left (f, i + 1)
decodeUtf8 :: (Word8 -> Char) -> B.ByteString -> [Char]
decodeUtf8 fun bs = loopDecode 0
where
loopDecode i = case tryDecodeUtf8Char bs i of
(Just (Right (c, i'))) -> c : loopDecode i'
(Just (Left (w, i'))) -> fun w : loopDecode i'
Nothing -> []
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment