Skip to content

Instantly share code, notes, and snippets.

@michaelt
Last active August 29, 2015 14:20
Show Gist options
  • Save michaelt/cfd97dea2a255235e2fc to your computer and use it in GitHub Desktop.
Save michaelt/cfd97dea2a255235e2fc to your computer and use it in GitHub Desktop.
serialize and deserialize FastaSequences as Text
-- Parse module.
-- By G.W. Schwartz
--
{- | Collection of functions for the parsing of a fasta file. Uses the Text
type.
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
module Data.Fasta.Text.Parse ( parseFasta
, parseCLIPFasta
, removeNs
, removeN
, produceSequences
, sequenceHandle
, sequencesToText
, removeCLIPNs ) where
-- Built-in
import Data.Char
import Control.Monad (void)
import Data.Monoid ((<>))
import Text.Parsec
import Text.Parsec.Text
import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import qualified System.IO as IO
import Pipes
import qualified Pipes.Prelude as P
import qualified Pipes.Text as PT
import qualified Pipes.Text.Encoding as PT
import qualified Pipes.ByteString as PB
import Pipes.Group
-- Local
import Data.Fasta.Text.Types
eol :: Parsec T.Text u String
eol = choice . map (try . string) $ ["\n\r", "\r\n", "\n", "\r"]
eoe :: Parsec T.Text u ()
eoe = do
lookAhead (void $ char '>') <|> eof
fasta :: Parsec T.Text u FastaSequence
fasta = do
spaces
char '>'
header <- manyTill (satisfy (/= '>')) eol
fseq <- manyTill anyChar eoe
return (FastaSequence { fastaHeader = T.pack header
, fastaSeq = T.pack
. map toUpper
. removeWhitespace
$ fseq } )
where
removeWhitespace = filter (`notElem` ("\n\r " :: String))
fastaFile :: Parsec T.Text u [FastaSequence]
fastaFile = do
spaces
many fasta
fastaCLIP :: Parsec T.Text u (FastaSequence, [FastaSequence])
fastaCLIP = do
spaces
char '>'
germline <- fasta
clones <- many $ try fasta
return (germline, clones)
fastaCLIPFile :: Parsec T.Text u [(FastaSequence, [FastaSequence])]
fastaCLIPFile = do
spaces
many fastaCLIP
-- | Parse a standard fasta file into text sequences
parseFasta :: T.Text -> [FastaSequence]
parseFasta = eToV . parse fastaFile "error"
where
eToV (Right x) = x
eToV (Left x) = error ("Unable to parse fasta file\n" ++ show x)
-- | Parse a CLIP fasta file into text sequences
parseCLIPFasta :: T.Text -> CloneMap
parseCLIPFasta = Map.fromList
. map (\(!x, (!y, !z)) -> ((x, y), z))
. zip [0..]
. eToV
. parse fastaCLIPFile "error"
where
eToV (Right x) = x
eToV (Left x) = error ("Unable to parse fasta file\n" ++ show x)
-- | Remove Ns from a collection of sequences
removeNs :: [FastaSequence] -> [FastaSequence]
removeNs = map (\x -> x { fastaSeq = noN . fastaSeq $ x })
where
noN = T.map (\y -> if (y /= 'N' && y /= 'n') then y else '-')
-- | Remove Ns from a sequence
removeN :: FastaSequence -> FastaSequence
removeN x = x { fastaSeq = noN . fastaSeq $ x }
where
noN = T.map (\y -> if (y /= 'N' && y /= 'n') then y else '-')
-- | Remove Ns from a collection of CLIP fasta sequences
removeCLIPNs :: CloneMap -> CloneMap
removeCLIPNs = Map.fromList . map remove . Map.toList
where
remove ((!x, !y), !z) = ((x, newSeq y), map newSeq z)
newSeq !x = x { fastaSeq = noN . fastaSeq $ x }
noN = T.map (\y -> if (y /= 'N' && y /= 'n') then y else '-')
data Chunk = Header T.Text | Chunk T.Text deriving (Show, Eq, Ord)
produceSequences :: Monad m => Producer T.Text m r -> Producer FastaSequence m r
produceSequences = sequenceChunks . chunkText where
-- these composition could be optimized, since these are not at the top level
-- similarly with the reverse operation
chunkText :: Monad m => Producer T.Text m r -> Producer Chunk m r
chunkText text = concat_lines text
>-> P.map T.strip
>-> P.dropWhile (\t -> T.take 1 t /= ">")
>-> P.map parse_line
where -- we use PT.decode = view/(^.) here to avoid importing a lens library
concat_lines = folds ((<>)) T.empty id . PT.decode PT.lines
parse_line t = case T.uncons t of
Just ('>',rest) -> Header rest
_ -> Chunk t
sequenceChunks :: Monad m => Producer Chunk m r -> Producer FastaSequence m r
sequenceChunks p = do
chunk1 <- lift $ next p
case chunk1 of
Left r -> return r
Right (Header t, p') -> chunk_loop t T.empty p'
_ -> error "chunk producer does not begin with a fasta header"
where
chunk_loop h acc p = do
e <- lift (next p)
case e of
Left r -> do
yield (FastaSequence h acc)
return r
Right (chunk,p') -> case chunk of
Header t -> do yield (FastaSequence h acc)
chunk_loop t T.empty p'
Chunk t -> chunk_loop h (acc <> t) p'
sequenceHandle :: MonadIO m => IO.Handle -> Producer FastaSequence m ()
sequenceHandle = produceSequences
. void
. PT.decodeUtf8
. PB.fromHandle
sequencesToText :: Monad m => Pipe FastaSequence T.Text m r
sequencesToText = chunkSequence >-> textChunks where
-- the composition could be optimized since these are not at top level
chunkSequence :: Monad m => Pipe FastaSequence Chunk m r
chunkSequence = loop
where
loop = do
FastaSequence h t <- await
yield (Header h)
for (each (T.chunksOf 60 t))
(yield . Chunk)
loop
textChunks :: Monad m => Pipe Chunk T.Text m r
textChunks = P.map strip
where strip chunk = case chunk of
Header t -> T.snoc (T.cons '>' t) '\n'
Chunk t -> T.snoc t '\n'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment