Last active
August 29, 2015 14:16
-
-
Save bacher09/534da754e3ef54350cbb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Utf8 where | |
import qualified Data.ByteString as B | |
import Control.Applicative | |
import Data.Word | |
import Data.Bits | |
import Data.Char | |
import Debug.Trace | |
utf8SeqInfo :: Word8 -> Maybe (Word8, Word8) | |
utf8SeqInfo c | |
| c < oneb = Just (0, c) | |
| c < twob = Nothing | |
| c < threb = Just (1, dropBytes threb) | |
| c < fourb = Just (2, dropBytes fourb) | |
| c < fiveb = Just (3, dropBytes fiveb) | |
| c < sixb = Just (4, dropBytes sixb) | |
| c < sevb = Just (5, dropBytes sevb) | |
| otherwise = Nothing | |
where | |
oneb = 0x80 -- 1000 0000 | |
twob = 0xC0 -- 1100 0000 | |
threb = 0xE0 -- 1110 0000 | |
fourb = 0xF0 -- 1111 0000 | |
fiveb = 0xF8 -- 1111 1000 | |
sixb = 0xFC -- 1111 1100 | |
sevb = 0xFE -- 1111 1110 | |
dropBytes b = c .&. complement b | |
utf8ExtInfo :: Word8 -> Maybe Word8 | |
utf8ExtInfo c = if c >= 0x80 && c < 0xC0 | |
then Just $ c .&. 0x3F | |
else Nothing | |
maybeIndex :: B.ByteString -> Int -> Maybe Word8 | |
maybeIndex s i = if B.length s > i | |
then Just $ s `B.index` i | |
else Nothing | |
maybeChar :: Int -> Maybe Char | |
maybeChar code = if code <= 0x10FFFF -- maximal char | |
then Just $ chr code | |
else Nothing | |
decodeUtf8Char :: B.ByteString -> Int -> Word8 -> Maybe (Char, Int) | |
decodeUtf8Char bs i b = do | |
(read_count, part) <- utf8SeqInfo b | |
(char_code, from) <- readUtf8Seq (fromIntegral part) read_count i | |
sym <- maybeChar char_code | |
return (sym, from) | |
where | |
readUtf8Seq :: Int -> Word8 -> Int -> Maybe (Int, Int) | |
readUtf8Seq p 0 from = Just (p, from) | |
readUtf8Seq p n from = do | |
next_part <- bs `maybeIndex` from >>= utf8ExtInfo | |
let p' = p `shiftL` 6 .|. fromIntegral next_part | |
readUtf8Seq p' (n - 1) (from + 1) | |
tryDecodeUtf8Char :: B.ByteString -> Int -> Maybe (Either (Word8, Int) (Char, Int)) | |
tryDecodeUtf8Char bs i = do | |
f <- bs `maybeIndex` i | |
case decodeUtf8Char bs (i + 1) f of | |
(Just v) -> return $ Right v | |
Nothing -> return $ Left (f, i + 1) | |
decodeUtf8 :: (Word8 -> Char) -> B.ByteString -> [Char] | |
decodeUtf8 fun bs = loopDecode 0 | |
where | |
loopDecode i = case tryDecodeUtf8Char bs i of | |
(Just (Right (c, i'))) -> c : loopDecode i' | |
(Just (Left (w, i'))) -> fun w : loopDecode i' | |
Nothing -> [] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment