Created
July 16, 2015 00:32
-
-
Save lambdaofgod/c6071546025c8f843ef0 to your computer and use it in GitHub Desktop.
Some bioinformatics stuff for Rosalind
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Chapter1 where | |
--import Data.Map | |
import Control.Monad | |
import Data.List | |
import Data.Char | |
data Base = A | G | C | T deriving(Eq, Show) | |
bases = [A,G,C,T] | |
type DNA = [Base] | |
complement A = T | |
complement T = A | |
complement G = C | |
complement C = G | |
charToBase 'A' = A | |
charToBase 'G' = G | |
charToBase 'C' = C | |
charToBase 'T' = T | |
stringToDna = map charToBase | |
dnaToString :: DNA -> String | |
dnaToString = foldl (\x y -> x ++ (show y)) "" | |
boolAsInt :: Bool -> Int | |
boolAsInt True = 1 | |
boolAsInt False = 0 | |
-- how many occurrences of pt are in strand? | |
patternCount :: [Base] -> [Base] -> Int | |
patternCount strand pt = | |
if length strand < length pt | |
then 0 | |
else | |
(boolAsInt ((take (length pt) strand) == pt)) + patternCount (tail strand) pt | |
kmers :: Int -> [DNA] | |
kmers 1 = map (\x -> [x]) [A,G,C,T] | |
kmers n = [b : kmer | kmer <- kmers (n-1), b <- bases] | |
kmersIn :: Int -> DNA -> [DNA] | |
kmersIn n s = | |
if (length s) < n | |
then [] | |
else | |
(take n s) : (kmersIn n (tail s)) | |
mostFrequentKmers :: Int -> DNA -> [DNA] | |
mostFrequentKmers n strand = filter (\x -> patternCount strand x == m) nmers | |
where | |
m = foldl max 0 (map (patternCount strand) nmers) | |
nmers = nub $ kmersIn n strand | |
kmerWithIndex :: Int -> Int -> DNA -> (DNA,Int) | |
kmerWithIndex length i strand = ((take length strand),i) | |
kmer :: Int -> Int -> DNA -> DNA | |
kmer len i strand = fst $ kmerWithIndex len i strand | |
kmersWithIndices :: Int -> DNA -> [(DNA,Int)] | |
kmersWithIndices n = kmersWithIndicesTail n 0 | |
where | |
kmersWithIndicesTail n i strand = | |
if (length strand) < n | |
then [] | |
else | |
(kmerWithIndex n i strand) : (kmersWithIndicesTail n (i+1) (tail strand)) | |
reverseComplement = (map complement . reverse) | |
clumpFinding :: Int -> Int -> Int -> DNA -> [DNA] | |
clumpFinding k l t strand = nub $ map fst $ allFormingWindows [] aux | |
where | |
aux = kmersWithIndices k strand | |
kmersInWindow lst = result | |
where | |
hd = head lst | |
inWindow p = (abs ((snd p) - (snd hd)) <= l) | |
result = filter (\p -> (fst p) == (fst hd)) (takeWhile inWindow lst) | |
allFormingWindows acc [] = acc | |
allFormingWindows acc xs = | |
if t <= (length $ kmersInWindow xs) | |
then allFormingWindows ((head $ kmersInWindow xs) : acc) (tail xs) | |
else | |
allFormingWindows acc (tail xs) | |
{- | |
WEEK 1 CS | |
-} | |
baseToInt A = 0 | |
baseToInt C = 1 | |
baseToInt G = 2 | |
baseToInt T = 3 | |
patternToNumber = helper . reverse | |
where | |
helper [] = 0 | |
helper (b : bs) = 4*(helper bs) + (baseToInt b) | |
{- | |
WEEK 2 | |
-} | |
skew :: DNA -> [Int] | |
skew = skewTail 0 | |
where | |
skewTail n [] = [] | |
skewTail n (C : rest) = n : (skewTail (n-1) rest) | |
skewTail n (G : rest) = n : (skewTail (n+1) rest) | |
skewTail n (ch : rest) = n : (skewTail n rest) | |
minimumSkew :: DNA -> [Int] | |
minimumSkew [] = [] | |
minimumSkew strand = minimalIndices | |
where | |
helper :: Int -> Int -> [Int] -> [Int] | |
helper i m [] = [] | |
helper i m (x : xs) = | |
if (m == x) | |
then i : (helper (i+1) m xs) | |
else | |
helper (i+1) m xs | |
m = foldr min (head $ skew strand) (skew strand) | |
minimalIndices = helper 0 m (skew strand) | |
-- these are without replication | |
comb :: Int -> [a] -> [[a]] | |
comb 0 as = [[]] | |
comb n as = | |
if length as < n | |
then [] | |
else | |
(map (\l -> (head as) : l) (comb (n-1) (tail as))) ++ (comb n (tail as)) | |
combWithRep :: Int -> [a] -> [[a]] | |
combWithRep 0 as = [[]] | |
combWithRep 1 as = [[a] | a <- as] | |
combWithRep n as = [ a : prev | a <- as, prev <- combWithRep (n-1) as] | |
dist [] _ = 0 | |
dist _ [] = 0 | |
dist (x : xs) (y : ys) = | |
(boolAsInt (x /= y)) + dist xs ys | |
approximateOccurrences :: DNA -> Int -> DNA -> [Int] | |
approximateOccurrences pt mismatch = helper 0 [] | |
where | |
n = length pt | |
helper i acc strand = | |
if (length strand) < (length pt) | |
then acc | |
else | |
let | |
substr = take n strand | |
in | |
if ((dist substr pt) <= mismatch) | |
then helper (i + 1) (acc ++ [i]) (tail strand) | |
else | |
helper (i + 1) acc (tail strand) | |
countPatternWithMismatches :: DNA -> Int -> DNA -> Int | |
countPatternWithMismatches pt mismatch = helper 0 | |
where | |
helper i strand = | |
if (length strand) < (length pt) | |
then i | |
else | |
if ((dist strand pt) <= mismatch) | |
then helper (i + 1) (tail strand) | |
else helper i (tail strand) | |
{- | |
frequentWithMismatches :: DNA -> Int -> Int -> DNA | |
frequentWithMismatches strand length mismatches = | |
where occurring = kmers length strand | |
-- it seems that we need to find all mismatched words and then choose the which occurs the most | |
-} | |
-- helpers | |
f = clumpFinding 9 500 3 | |
clumpsInEcoli = do | |
contents <- readFile "E-coli.txt" | |
let parsed = filter (\x -> elem x ['A', 'G', 'C' , 'T']) contents | |
in | |
writeFile "out" (intercalate " " $ map dnaToString $ f $ stringToDna parsed) | |
-- write to repl | |
-- printList $ map dnaToString $ f $ stringToDna parsed | |
printList :: [String] -> IO [()] | |
printList l = sequence $ map putStrLn l | |
handleFile inName outName f = do | |
contents <- readFile inName | |
let processed = take ((length contents) - 1) contents | |
in writeFile outName $ (dnaToString . f . stringToDna) processed |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment