Created
February 20, 2014 20:20
-
-
Save michaelt/9122379 to your computer and use it in GitHub Desktop.
parser zoom trouble
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
{-# LANGUAGE BangPatterns#-} | |
{-# LANGUAGE RankNTypes #-} | |
import Blaze.ByteString.Builder (Builder, fromByteString, toByteString) | |
import Control.Exception (Exception) | |
import Control.Monad.Trans.Class (lift) | |
import Data.ByteString (ByteString) | |
import qualified Data.ByteString as S | |
import qualified Data.ByteString.Lazy as L | |
import Data.Monoid | |
import Data.Text (Text) | |
import qualified Data.Text as T | |
import qualified Data.Text.Encoding as TEE | |
import qualified Data.Text.Lazy as TL | |
import qualified Data.Text.Lazy.Encoding as TLE | |
import Pipes | |
import Pipes.Parse | |
import qualified Pipes.Prelude as PP | |
import qualified Pipes.ByteString as Bytes | |
import qualified Pipes.Text as Txt | |
import Pipes.Text.Encoding (utf8) | |
import Control.Lens | |
import Control.Lens.Internal.Zoom | |
import Control.Monad | |
import qualified System.IO as IO | |
import Control.Monad.Trans.Maybe | |
import Control.Monad.State.Class | |
main :: IO () | |
main = do -- S.writeFile fp $ contents 10000 -- 10000 cannot be handled fileParser0 and 1 | |
-- parse_file fileParser0 -- pathological | |
-- parse_file fileParser1 -- programs | |
parse_file fileParser2 -- good program | |
where | |
parse_file parser = IO.withBinaryFile fp IO.ReadMode $ \h -> | |
do p' <- runEffect $ parseWith parser ( Bytes.fromHandle h ) >-> PP.print | |
runEffect $ p' >-> PP.print | |
parseWith parser = loop where | |
loop p = do (m,p') <- lift (runStateT (runMaybeT parser) p) | |
case m of Nothing -> return p' | |
Just file -> do yield file | |
loop p' | |
fp = "encoded.fileformat" | |
contents n = (toByteString . mconcat . replicate n . encodeFiles) input | |
<> S.pack (replicate 10 250) | |
-- infix lens nonsense | |
infixr 1 ~~> | |
(~~>) :: Zoom m n s t | |
=> ((s -> Zoomed n c s) -> t -> Zoomed n c t) | |
-> m c -> n c | |
(~~>) = zoom | |
{-# INLINE (~~>) #-} | |
fileParser0, fileParser1, fileParser2 :: Monad m => MaybeT (StateT (Producer ByteString m x) m) File | |
fileParser0 = do (name, len) <- zoom utf8 parseText | |
contents <- zoom (Bytes.splitAt len) (lift drawAll) | |
return (File name (S.concat contents)) | |
where | |
-- this parser aggregates all Text parsing into one preliminary parser | |
-- which is then applied with `zoom utf8` | |
-- we cannot tell in advance how long, e.g. the file name will be | |
parseText :: Monad m => MaybeT (StateT (Producer Text m x) m) (Text, Int) | |
parseText = do nameLength <- parseNumber | |
names <- zoom (Txt.splitAt nameLength) $ (lift drawAll) | |
contentLength <- parseNumber | |
return $! (T.concat names, contentLength) | |
-- here we disaggregate the little Text parsers but still apply them with `zoom utf8` | |
-- this makes no difference | |
fileParser1 = do nameLength <- utf8 ~~> parseNumber | |
names <- utf8 ~~> Txt.splitAt nameLength ~~> lift drawAll | |
contentLength <- utf8 ~~> parseNumber | |
contents <- Bytes.splitAt contentLength ~~> lift drawAll | |
return (File (T.concat names) (S.concat contents)) | |
-- this is the good program; be reflecting on the fact that file names | |
-- should not be a 1000 bytes long, and binary files longer than e.g. 10 ^ 10 | |
-- we can restrict the length of the byte stream to which we apply `zoom utf8` | |
fileParser2 = do nameLength <- Bytes.splitAt 3 ~~> utf8 ~~> parseNumber | |
names <- Bytes.splitAt nameLength ~~> utf8 ~~> lift drawAll | |
len <- Bytes.splitAt 10 ~~> utf8 ~~> parseNumber | |
contents <- Bytes.splitAt len ~~> lift drawAll | |
return (File (T.concat names) (S.concat contents)) | |
parseNumber :: Monad m => MaybeT (StateT (Producer Text m x) m) Int | |
parseNumber = loop 0 where | |
loop !n = do c <- MaybeT Txt.drawChar | |
case c of ':' -> return n | |
_ -> do guard ('0' <= c && c <= '9') | |
loop $! n * 10 + (fromEnum c - fromEnum '0') | |
-- --- Michael S's `File` type and its binary encoding, etc. | |
data File = File | |
{ fileName :: !Text | |
, fileContents :: !ByteString | |
} | |
deriving Show | |
encodeFile :: File -> Builder | |
encodeFile (File name contents) = | |
tellLength (S.length bytesname) <> | |
fromByteString bytesname <> | |
tellLength (S.length contents) <> | |
fromByteString contents | |
where | |
tellLength i = fromByteString $ TEE.encodeUtf8 (T.pack (shows i ":")) | |
bytesname = TEE.encodeUtf8 name | |
encodeFiles :: [File] -> Builder | |
encodeFiles = mconcat . map encodeFile | |
input :: [File] | |
input = | |
[ File "utf8.txt" $ TEE.encodeUtf8 "This file is in UTF-8" | |
, File "utf16.txt" $ TEE.encodeUtf16LE "This file is in UTF-16" | |
, File "binary.dat" "we'll pretend to be binary" | |
] | |
--- | |
-- This desperate scheme actually has some efficacy, if used before `utf8` in a zoom | |
-- but not much | |
chunk :: Monad m => Int -> Lens' (Producer ByteString m r) (Producer ByteString m r) | |
chunk n = lens (chunkyN n) (\_ b -> b) where | |
chunkyN :: Monad m => Int -> Producer ByteString m r -> Producer ByteString m r | |
chunkyN n = prod_loop where | |
prod_loop p = do mbs <- lift $ next p | |
case mbs of Left r -> return r | |
Right (bs, p') -> do bs_loop bs | |
prod_loop p' | |
bs_loop bs = unless (S.null bs) $ do yield fore | |
unless (S.null aft) (bs_loop aft) | |
where (fore, aft) = S.splitAt n bs | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment