Last active
August 29, 2015 14:20
-
-
Save michaelt/cfd97dea2a255235e2fc to your computer and use it in GitHub Desktop.
serialize and deserialize FastaSequences as Text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Parse module. | |
-- By G.W. Schwartz | |
-- | |
{- | Collection of functions for the parsing of a fasta file. Uses the Text | |
type. | |
-} | |
{-# LANGUAGE OverloadedStrings #-} | |
{-# LANGUAGE BangPatterns #-} | |
module Data.Fasta.Text.Parse ( parseFasta | |
, parseCLIPFasta | |
, removeNs | |
, removeN | |
, produceSequences | |
, sequenceHandle | |
, sequencesToText | |
, removeCLIPNs ) where | |
-- Built-in | |
import Data.Char | |
import Control.Monad (void) | |
import Data.Monoid ((<>)) | |
import Text.Parsec | |
import Text.Parsec.Text | |
import qualified Data.Map.Strict as Map | |
import qualified Data.Text as T | |
import qualified System.IO as IO | |
import Pipes | |
import qualified Pipes.Prelude as P | |
import qualified Pipes.Text as PT | |
import qualified Pipes.Text.Encoding as PT | |
import qualified Pipes.ByteString as PB | |
import Pipes.Group | |
-- Local | |
import Data.Fasta.Text.Types | |
eol :: Parsec T.Text u String | |
eol = choice . map (try . string) $ ["\n\r", "\r\n", "\n", "\r"] | |
eoe :: Parsec T.Text u () | |
eoe = do | |
lookAhead (void $ char '>') <|> eof | |
fasta :: Parsec T.Text u FastaSequence | |
fasta = do | |
spaces | |
char '>' | |
header <- manyTill (satisfy (/= '>')) eol | |
fseq <- manyTill anyChar eoe | |
return (FastaSequence { fastaHeader = T.pack header | |
, fastaSeq = T.pack | |
. map toUpper | |
. removeWhitespace | |
$ fseq } ) | |
where | |
removeWhitespace = filter (`notElem` ("\n\r " :: String)) | |
fastaFile :: Parsec T.Text u [FastaSequence] | |
fastaFile = do | |
spaces | |
many fasta | |
fastaCLIP :: Parsec T.Text u (FastaSequence, [FastaSequence]) | |
fastaCLIP = do | |
spaces | |
char '>' | |
germline <- fasta | |
clones <- many $ try fasta | |
return (germline, clones) | |
fastaCLIPFile :: Parsec T.Text u [(FastaSequence, [FastaSequence])] | |
fastaCLIPFile = do | |
spaces | |
many fastaCLIP | |
-- | Parse a standard fasta file into text sequences | |
parseFasta :: T.Text -> [FastaSequence] | |
parseFasta = eToV . parse fastaFile "error" | |
where | |
eToV (Right x) = x | |
eToV (Left x) = error ("Unable to parse fasta file\n" ++ show x) | |
-- | Parse a CLIP fasta file into text sequences | |
parseCLIPFasta :: T.Text -> CloneMap | |
parseCLIPFasta = Map.fromList | |
. map (\(!x, (!y, !z)) -> ((x, y), z)) | |
. zip [0..] | |
. eToV | |
. parse fastaCLIPFile "error" | |
where | |
eToV (Right x) = x | |
eToV (Left x) = error ("Unable to parse fasta file\n" ++ show x) | |
-- | Remove Ns from a collection of sequences | |
removeNs :: [FastaSequence] -> [FastaSequence] | |
removeNs = map (\x -> x { fastaSeq = noN . fastaSeq $ x }) | |
where | |
noN = T.map (\y -> if (y /= 'N' && y /= 'n') then y else '-') | |
-- | Remove Ns from a sequence | |
removeN :: FastaSequence -> FastaSequence | |
removeN x = x { fastaSeq = noN . fastaSeq $ x } | |
where | |
noN = T.map (\y -> if (y /= 'N' && y /= 'n') then y else '-') | |
-- | Remove Ns from a collection of CLIP fasta sequences | |
removeCLIPNs :: CloneMap -> CloneMap | |
removeCLIPNs = Map.fromList . map remove . Map.toList | |
where | |
remove ((!x, !y), !z) = ((x, newSeq y), map newSeq z) | |
newSeq !x = x { fastaSeq = noN . fastaSeq $ x } | |
noN = T.map (\y -> if (y /= 'N' && y /= 'n') then y else '-') | |
data Chunk = Header T.Text | Chunk T.Text deriving (Show, Eq, Ord) | |
produceSequences :: Monad m => Producer T.Text m r -> Producer FastaSequence m r | |
produceSequences = sequenceChunks . chunkText where | |
-- these composition could be optimized, since these are not at the top level | |
-- similarly with the reverse operation | |
chunkText :: Monad m => Producer T.Text m r -> Producer Chunk m r | |
chunkText text = concat_lines text | |
>-> P.map T.strip | |
>-> P.dropWhile (\t -> T.take 1 t /= ">") | |
>-> P.map parse_line | |
where -- we use PT.decode = view/(^.) here to avoid importing a lens library | |
concat_lines = folds ((<>)) T.empty id . PT.decode PT.lines | |
parse_line t = case T.uncons t of | |
Just ('>',rest) -> Header rest | |
_ -> Chunk t | |
sequenceChunks :: Monad m => Producer Chunk m r -> Producer FastaSequence m r | |
sequenceChunks p = do | |
chunk1 <- lift $ next p | |
case chunk1 of | |
Left r -> return r | |
Right (Header t, p') -> chunk_loop t T.empty p' | |
_ -> error "chunk producer does not begin with a fasta header" | |
where | |
chunk_loop h acc p = do | |
e <- lift (next p) | |
case e of | |
Left r -> do | |
yield (FastaSequence h acc) | |
return r | |
Right (chunk,p') -> case chunk of | |
Header t -> do yield (FastaSequence h acc) | |
chunk_loop t T.empty p' | |
Chunk t -> chunk_loop h (acc <> t) p' | |
sequenceHandle :: MonadIO m => IO.Handle -> Producer FastaSequence m () | |
sequenceHandle = produceSequences | |
. void | |
. PT.decodeUtf8 | |
. PB.fromHandle | |
sequencesToText :: Monad m => Pipe FastaSequence T.Text m r | |
sequencesToText = chunkSequence >-> textChunks where | |
-- the composition could be optimized since these are not at top level | |
chunkSequence :: Monad m => Pipe FastaSequence Chunk m r | |
chunkSequence = loop | |
where | |
loop = do | |
FastaSequence h t <- await | |
yield (Header h) | |
for (each (T.chunksOf 60 t)) | |
(yield . Chunk) | |
loop | |
textChunks :: Monad m => Pipe Chunk T.Text m r | |
textChunks = P.map strip | |
where strip chunk = case chunk of | |
Header t -> T.snoc (T.cons '>' t) '\n' | |
Chunk t -> T.snoc t '\n' | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment