Skip to content

Instantly share code, notes, and snippets.

@lambdaofgod
Created July 16, 2015 00:32
Show Gist options
  • Save lambdaofgod/c6071546025c8f843ef0 to your computer and use it in GitHub Desktop.
Save lambdaofgod/c6071546025c8f843ef0 to your computer and use it in GitHub Desktop.
Some bioinformatics stuff for Rosalind
module Chapter1 where
--import Data.Map
import Control.Monad
import Data.List
import Data.Char
data Base = A | G | C | T deriving(Eq, Show)
bases = [A,G,C,T]
type DNA = [Base]
complement A = T
complement T = A
complement G = C
complement C = G
charToBase 'A' = A
charToBase 'G' = G
charToBase 'C' = C
charToBase 'T' = T
stringToDna = map charToBase
dnaToString :: DNA -> String
dnaToString = foldl (\x y -> x ++ (show y)) ""
boolAsInt :: Bool -> Int
boolAsInt True = 1
boolAsInt False = 0
-- how many occurrences of pt are in strand?
patternCount :: [Base] -> [Base] -> Int
patternCount strand pt =
if length strand < length pt
then 0
else
(boolAsInt ((take (length pt) strand) == pt)) + patternCount (tail strand) pt
kmers :: Int -> [DNA]
kmers 1 = map (\x -> [x]) [A,G,C,T]
kmers n = [b : kmer | kmer <- kmers (n-1), b <- bases]
kmersIn :: Int -> DNA -> [DNA]
kmersIn n s =
if (length s) < n
then []
else
(take n s) : (kmersIn n (tail s))
mostFrequentKmers :: Int -> DNA -> [DNA]
mostFrequentKmers n strand = filter (\x -> patternCount strand x == m) nmers
where
m = foldl max 0 (map (patternCount strand) nmers)
nmers = nub $ kmersIn n strand
kmerWithIndex :: Int -> Int -> DNA -> (DNA,Int)
kmerWithIndex length i strand = ((take length strand),i)
kmer :: Int -> Int -> DNA -> DNA
kmer len i strand = fst $ kmerWithIndex len i strand
kmersWithIndices :: Int -> DNA -> [(DNA,Int)]
kmersWithIndices n = kmersWithIndicesTail n 0
where
kmersWithIndicesTail n i strand =
if (length strand) < n
then []
else
(kmerWithIndex n i strand) : (kmersWithIndicesTail n (i+1) (tail strand))
reverseComplement = (map complement . reverse)
clumpFinding :: Int -> Int -> Int -> DNA -> [DNA]
clumpFinding k l t strand = nub $ map fst $ allFormingWindows [] aux
where
aux = kmersWithIndices k strand
kmersInWindow lst = result
where
hd = head lst
inWindow p = (abs ((snd p) - (snd hd)) <= l)
result = filter (\p -> (fst p) == (fst hd)) (takeWhile inWindow lst)
allFormingWindows acc [] = acc
allFormingWindows acc xs =
if t <= (length $ kmersInWindow xs)
then allFormingWindows ((head $ kmersInWindow xs) : acc) (tail xs)
else
allFormingWindows acc (tail xs)
{-
WEEK 1 CS
-}
baseToInt A = 0
baseToInt C = 1
baseToInt G = 2
baseToInt T = 3
patternToNumber = helper . reverse
where
helper [] = 0
helper (b : bs) = 4*(helper bs) + (baseToInt b)
{-
WEEK 2
-}
skew :: DNA -> [Int]
skew = skewTail 0
where
skewTail n [] = []
skewTail n (C : rest) = n : (skewTail (n-1) rest)
skewTail n (G : rest) = n : (skewTail (n+1) rest)
skewTail n (ch : rest) = n : (skewTail n rest)
minimumSkew :: DNA -> [Int]
minimumSkew [] = []
minimumSkew strand = minimalIndices
where
helper :: Int -> Int -> [Int] -> [Int]
helper i m [] = []
helper i m (x : xs) =
if (m == x)
then i : (helper (i+1) m xs)
else
helper (i+1) m xs
m = foldr min (head $ skew strand) (skew strand)
minimalIndices = helper 0 m (skew strand)
-- these are without replication
comb :: Int -> [a] -> [[a]]
comb 0 as = [[]]
comb n as =
if length as < n
then []
else
(map (\l -> (head as) : l) (comb (n-1) (tail as))) ++ (comb n (tail as))
combWithRep :: Int -> [a] -> [[a]]
combWithRep 0 as = [[]]
combWithRep 1 as = [[a] | a <- as]
combWithRep n as = [ a : prev | a <- as, prev <- combWithRep (n-1) as]
dist [] _ = 0
dist _ [] = 0
dist (x : xs) (y : ys) =
(boolAsInt (x /= y)) + dist xs ys
approximateOccurrences :: DNA -> Int -> DNA -> [Int]
approximateOccurrences pt mismatch = helper 0 []
where
n = length pt
helper i acc strand =
if (length strand) < (length pt)
then acc
else
let
substr = take n strand
in
if ((dist substr pt) <= mismatch)
then helper (i + 1) (acc ++ [i]) (tail strand)
else
helper (i + 1) acc (tail strand)
countPatternWithMismatches :: DNA -> Int -> DNA -> Int
countPatternWithMismatches pt mismatch = helper 0
where
helper i strand =
if (length strand) < (length pt)
then i
else
if ((dist strand pt) <= mismatch)
then helper (i + 1) (tail strand)
else helper i (tail strand)
{-
frequentWithMismatches :: DNA -> Int -> Int -> DNA
frequentWithMismatches strand length mismatches =
where occurring = kmers length strand
-- it seems that we need to find all mismatched words and then choose the which occurs the most
-}
-- helpers
f = clumpFinding 9 500 3
clumpsInEcoli = do
contents <- readFile "E-coli.txt"
let parsed = filter (\x -> elem x ['A', 'G', 'C' , 'T']) contents
in
writeFile "out" (intercalate " " $ map dnaToString $ f $ stringToDna parsed)
-- write to repl
-- printList $ map dnaToString $ f $ stringToDna parsed
printList :: [String] -> IO [()]
printList l = sequence $ map putStrLn l
handleFile inName outName f = do
contents <- readFile inName
let processed = take ((length contents) - 1) contents
in writeFile outName $ (dnaToString . f . stringToDna) processed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment