Created
December 29, 2014 17:52
-
-
Save monoid/7e5bf172a4fd7192a556 to your computer and use it in GitHub Desktop.
Variable-lenght encoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import Data.Bits | |
| import qualified Data.ByteString as BS | |
| import Data.Maybe | |
| import Data.Word | |
| import Data.List.Split -- main, для профилирования | |
| {- | |
| 8-кодирование: число разбивается на группы по 7 бит, начиная с младших битов, | |
| и записываются в файл. В последней группе 8-й бит установлен в 1 как признак | |
| конца группы. | |
| Правильно это называется Variable length quantity. | |
| -} | |
| eightBytes :: Maybe Int -> Maybe (Word8, Maybe Int) | |
| eightBytes Nothing = Nothing | |
| eightBytes (Just n) = if n < 0x80 then | |
| Just (toEnum (n .|. 0x80), Nothing) | |
| else | |
| Just (toEnum (n .&. 0x7F), Just $ shiftR n 7) | |
| packEight :: Int -> BS.ByteString | |
| packEight = BS.unfoldr eightBytes . Just | |
| -- v ++ [new_a] выглядит подозрительно даже с учётом ленивости. | |
| uneightBytes :: ([Int], Int, Int) -> Word8 -> ([Int], Int, Int) | |
| uneightBytes (parsed, acc, n) b = | |
| if (b .&. 0x80) == 0 then | |
| (parsed, new_acc, n + 7) | |
| else | |
| (parsed ++ [new_acc], 0, 0) | |
| where new_acc = acc .|. ((fromEnum (b .&. 0x7F)) `shiftL` n) | |
| unpackEights :: BS.ByteString -> [Int] | |
| unpackEights bytes = values | |
| where (values, a, n) = BS.foldl uneightBytes ([], 0, 0) bytes | |
| -- TODO: if n > 0, raise error | |
| uneight :: Word8 -> ([Int], Maybe Int) -> ([Int], Maybe Int) | |
| uneight bt (parsed, acc) = | |
| if (bt .&. 0x80) == 0 then | |
| case acc of | |
| Just n -> (parsed, Just $ (n `shiftL` 7) .|. fromEnum bt) | |
| Nothing -> error "Incomplete 8bit sequence." | |
| else | |
| ((maybeToList acc) ++ parsed, Just $ fromEnum (bt .&. 0x7F)) | |
| unpackEights' :: BS.ByteString -> [Int] | |
| unpackEights' bytes = maybeToList acc ++ values | |
| where (values, acc) = BS.foldr uneight ([], Nothing) bytes | |
| unpackEights'' :: BS.ByteString -> [Int] | |
| unpackEights'' bytes = uneight'' bytes 0 0 0 $ BS.length bytes | |
| uneight'' :: BS.ByteString -> Int -> Int -> Int -> Int -> [Int] | |
| uneight'' bytes !acc !n !pos !end | |
| | (pos == end) = [] | |
| | otherwise = if (bt .&. 0x80) == 0 then | |
| -- Хвостовая рекурсия... | |
| (uneight'' bytes $! new_acc) (n+7) (pos+1) end | |
| else | |
| new_acc : uneight'' bytes 0 0 (pos + 1) end | |
| where bt = BS.index bytes pos | |
| new_acc = (acc .|. ((fromEnum (bt .&. 0x7F) `shiftL` n))) | |
| main = do | |
| let vals = map BS.concat $ chunksOf 1000 $ map packEight [0..100000000] | |
| let unvals = concatMap (take 15 . unpackEights) vals | |
| putStrLn $ show $ sum unvals |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment