lambdaofgod · July 16, 2015 00:32
diff --git a/bioinfo.hs b/bioinfo.hs
 module Chapter1 where

 --import Data.Map 
 import Control.Monad
 import Data.List
 import Data.Char

 data Base = A | G | C | T  deriving(Eq, Show)
 bases = [A,G,C,T]
 type DNA = [Base]

 complement A = T
 complement T = A
 complement G = C
 complement C = G

 charToBase 'A' = A
 charToBase 'G' = G
 charToBase 'C' = C
 charToBase 'T' = T

 stringToDna = map charToBase
 dnaToString :: DNA -> String
 dnaToString = foldl (\x y -> x ++ (show y)) ""

 boolAsInt :: Bool -> Int
 boolAsInt True = 1
 boolAsInt False = 0

 -- how many occurrences of pt are in strand?
 patternCount :: [Base] -> [Base] -> Int
 patternCount strand pt = 
    if length strand < length pt
        then 0
    else
        (boolAsInt ((take (length pt) strand) == pt)) + patternCount (tail strand) pt 

 kmers :: Int -> [DNA]
 kmers 1 = map (\x -> [x]) [A,G,C,T]
 kmers n = [b : kmer |  kmer <- kmers (n-1), b <- bases]

 kmersIn :: Int -> DNA -> [DNA]
 kmersIn n s =
    if (length s) < n 
        then []
    else
        (take n s) : (kmersIn n (tail s))

 mostFrequentKmers :: Int -> DNA -> [DNA]
 mostFrequentKmers n strand = filter (\x -> patternCount strand x == m) nmers
    where 
        m = foldl max 0 (map (patternCount strand) nmers)
        nmers = nub $ kmersIn n strand

 kmerWithIndex :: Int -> Int -> DNA -> (DNA,Int)
 kmerWithIndex length i strand = ((take length strand),i)

 kmer :: Int -> Int -> DNA -> DNA
 kmer len i strand = fst $ kmerWithIndex len i strand

 kmersWithIndices :: Int -> DNA -> [(DNA,Int)]
 kmersWithIndices n = kmersWithIndicesTail n 0 
    where 
        kmersWithIndicesTail n i strand = 
            if (length strand) < n 
                then []
            else
                (kmerWithIndex n i strand) : (kmersWithIndicesTail n (i+1) (tail strand))

 reverseComplement = (map complement . reverse) 

 clumpFinding :: Int -> Int -> Int -> DNA -> [DNA]
 clumpFinding k l t strand = nub $ map fst $ allFormingWindows [] aux 
    where
    aux = kmersWithIndices k strand
    kmersInWindow lst = result 
        where
        hd = head lst
        inWindow p = (abs ((snd p) - (snd hd)) <= l)
        result = filter (\p -> (fst p) == (fst hd)) (takeWhile inWindow lst)
    allFormingWindows acc [] = acc
    allFormingWindows acc xs = 
        if t <= (length $ kmersInWindow xs)
            then allFormingWindows ((head $ kmersInWindow xs) : acc) (tail xs)
        else
            allFormingWindows acc (tail xs)
 {-
    WEEK 1 CS
 -}

 baseToInt A = 0
 baseToInt C = 1
 baseToInt G = 2
 baseToInt T = 3

 patternToNumber = helper . reverse
    where 
    helper [] = 0
    helper (b : bs) = 4*(helper bs) + (baseToInt b) 


 {-
    WEEK 2
 -}

 skew :: DNA -> [Int] 
 skew = skewTail 0 
    where  
        skewTail n [] = []
        skewTail n (C : rest) = n : (skewTail (n-1) rest) 
        skewTail n (G : rest) = n :  (skewTail (n+1) rest)
        skewTail n (ch : rest) = n : (skewTail n rest)

 minimumSkew :: DNA -> [Int]
 minimumSkew [] = []
 minimumSkew strand = minimalIndices
    where
        helper :: Int -> Int -> [Int] -> [Int]
        helper i m [] = []
        helper i m (x : xs) = 
            if (m == x)
                then i : (helper (i+1) m xs)
            else
                helper (i+1) m xs
        m = foldr min (head $ skew strand) (skew strand)
        minimalIndices = helper 0 m (skew strand)

 -- these are without replication
 comb :: Int -> [a] -> [[a]]
 comb 0 as = [[]]
 comb n as = 
    if length as < n
        then []
    else
       (map (\l -> (head as) : l) (comb (n-1) (tail as))) ++ (comb n (tail as))

 combWithRep :: Int -> [a] -> [[a]]
 combWithRep 0 as = [[]]
 combWithRep 1 as = [[a] | a <- as]
 combWithRep n as = [ a : prev | a <- as, prev <- combWithRep (n-1) as]

 dist [] _ = 0
 dist _ [] = 0 
 dist (x : xs) (y : ys) =
    (boolAsInt (x /= y)) + dist xs ys 

 approximateOccurrences :: DNA -> Int -> DNA -> [Int]
 approximateOccurrences pt mismatch = helper 0 [] 
    where
        n = length pt
        helper i acc strand = 
            if (length strand) < (length pt)
                then acc 
            else
                let 
                    substr = take n strand
                in
                    if  ((dist substr pt) <= mismatch)
                        then helper (i + 1) (acc ++ [i]) (tail strand) 
                    else
                        helper (i + 1) acc (tail strand) 

 countPatternWithMismatches :: DNA -> Int -> DNA -> Int
 countPatternWithMismatches pt mismatch = helper 0 
    where
        helper i strand = 
            if (length strand) < (length pt)
                then i
            else
                if ((dist strand pt) <= mismatch)
                then helper (i + 1) (tail strand) 
                else helper i (tail strand)

 {-
 frequentWithMismatches :: DNA -> Int -> Int -> DNA
 frequentWithMismatches strand length mismatches = 
    where occurring = kmers length strand
    -- it seems that we need to find all mismatched words and then choose the which occurs the most
 -}

 -- helpers
 f = clumpFinding 9 500 3
 clumpsInEcoli = do 
    contents <- readFile "E-coli.txt"
    let parsed = filter (\x -> elem x ['A', 'G', 'C' , 'T']) contents 
        in 
        writeFile "out" (intercalate " " $ map dnaToString $ f $ stringToDna parsed)
 
 -- write to repl
 -- printList $ map dnaToString $ f $ stringToDna parsed

 printList :: [String] -> IO [()]
 printList l = sequence $ map putStrLn l

 handleFile inName outName f = do
    contents <- readFile inName
    let processed = take ((length contents) - 1) contents
        in writeFile outName $ (dnaToString . f . stringToDna) processed
	module Chapter1 where

	--import Data.Map
	import Control.Monad
	import Data.List
	import Data.Char

	data Base = A \| G \| C \| T deriving(Eq, Show)
	bases = [A,G,C,T]
	type DNA = [Base]

	complement A = T
	complement T = A
	complement G = C
	complement C = G

	charToBase 'A' = A
	charToBase 'G' = G
	charToBase 'C' = C
	charToBase 'T' = T

	stringToDna = map charToBase
	dnaToString :: DNA -> String
	dnaToString = foldl (\x y -> x ++ (show y)) ""

	boolAsInt :: Bool -> Int
	boolAsInt True = 1
	boolAsInt False = 0

	-- how many occurrences of pt are in strand?
	patternCount :: [Base] -> [Base] -> Int
	patternCount strand pt =
	if length strand < length pt
	then 0
	else
	(boolAsInt ((take (length pt) strand) == pt)) + patternCount (tail strand) pt

	kmers :: Int -> [DNA]
	kmers 1 = map (\x -> [x]) [A,G,C,T]
	kmers n = [b : kmer \| kmer <- kmers (n-1), b <- bases]

	kmersIn :: Int -> DNA -> [DNA]
	kmersIn n s =
	if (length s) < n
	then []
	else
	(take n s) : (kmersIn n (tail s))

	mostFrequentKmers :: Int -> DNA -> [DNA]
	mostFrequentKmers n strand = filter (\x -> patternCount strand x == m) nmers
	where
	m = foldl max 0 (map (patternCount strand) nmers)
	nmers = nub $ kmersIn n strand

	kmerWithIndex :: Int -> Int -> DNA -> (DNA,Int)
	kmerWithIndex length i strand = ((take length strand),i)

	kmer :: Int -> Int -> DNA -> DNA
	kmer len i strand = fst $ kmerWithIndex len i strand

	kmersWithIndices :: Int -> DNA -> [(DNA,Int)]
	kmersWithIndices n = kmersWithIndicesTail n 0
	where
	kmersWithIndicesTail n i strand =
	if (length strand) < n
	then []
	else
	(kmerWithIndex n i strand) : (kmersWithIndicesTail n (i+1) (tail strand))

	reverseComplement = (map complement . reverse)

	clumpFinding :: Int -> Int -> Int -> DNA -> [DNA]
	clumpFinding k l t strand = nub $ map fst $ allFormingWindows [] aux
	where
	aux = kmersWithIndices k strand
	kmersInWindow lst = result
	where
	hd = head lst
	inWindow p = (abs ((snd p) - (snd hd)) <= l)
	result = filter (\p -> (fst p) == (fst hd)) (takeWhile inWindow lst)
	allFormingWindows acc [] = acc
	allFormingWindows acc xs =
	if t <= (length $ kmersInWindow xs)
	then allFormingWindows ((head $ kmersInWindow xs) : acc) (tail xs)
	else
	allFormingWindows acc (tail xs)
	{-
	WEEK 1 CS
	-}

	baseToInt A = 0
	baseToInt C = 1
	baseToInt G = 2
	baseToInt T = 3

	patternToNumber = helper . reverse
	where
	helper [] = 0
	helper (b : bs) = 4*(helper bs) + (baseToInt b)


	{-
	WEEK 2
	-}

	skew :: DNA -> [Int]
	skew = skewTail 0
	where
	skewTail n [] = []
	skewTail n (C : rest) = n : (skewTail (n-1) rest)
	skewTail n (G : rest) = n : (skewTail (n+1) rest)
	skewTail n (ch : rest) = n : (skewTail n rest)

	minimumSkew :: DNA -> [Int]
	minimumSkew [] = []
	minimumSkew strand = minimalIndices
	where
	helper :: Int -> Int -> [Int] -> [Int]
	helper i m [] = []
	helper i m (x : xs) =
	if (m == x)
	then i : (helper (i+1) m xs)
	else
	helper (i+1) m xs
	m = foldr min (head $ skew strand) (skew strand)
	minimalIndices = helper 0 m (skew strand)

	-- these are without replication
	comb :: Int -> [a] -> [[a]]
	comb 0 as = [[]]
	comb n as =
	if length as < n
	then []
	else
	(map (\l -> (head as) : l) (comb (n-1) (tail as))) ++ (comb n (tail as))

	combWithRep :: Int -> [a] -> [[a]]
	combWithRep 0 as = [[]]
	combWithRep 1 as = [[a] \| a <- as]
	combWithRep n as = [ a : prev \| a <- as, prev <- combWithRep (n-1) as]

	dist [] _ = 0
	dist _ [] = 0
	dist (x : xs) (y : ys) =
	(boolAsInt (x /= y)) + dist xs ys

	approximateOccurrences :: DNA -> Int -> DNA -> [Int]
	approximateOccurrences pt mismatch = helper 0 []
	where
	n = length pt
	helper i acc strand =
	if (length strand) < (length pt)
	then acc
	else
	let
	substr = take n strand
	in
	if ((dist substr pt) <= mismatch)
	then helper (i + 1) (acc ++ [i]) (tail strand)
	else
	helper (i + 1) acc (tail strand)

	countPatternWithMismatches :: DNA -> Int -> DNA -> Int
	countPatternWithMismatches pt mismatch = helper 0
	where
	helper i strand =
	if (length strand) < (length pt)
	then i
	else
	if ((dist strand pt) <= mismatch)
	then helper (i + 1) (tail strand)
	else helper i (tail strand)

	{-
	frequentWithMismatches :: DNA -> Int -> Int -> DNA
	frequentWithMismatches strand length mismatches =
	where occurring = kmers length strand
	-- it seems that we need to find all mismatched words and then choose the which occurs the most
	-}

	-- helpers
	f = clumpFinding 9 500 3
	clumpsInEcoli = do
	contents <- readFile "E-coli.txt"
	let parsed = filter (\x -> elem x ['A', 'G', 'C' , 'T']) contents
	in
	writeFile "out" (intercalate " " $ map dnaToString $ f $ stringToDna parsed)

	-- write to repl
	-- printList $ map dnaToString $ f $ stringToDna parsed

	printList :: [String] -> IO [()]
	printList l = sequence $ map putStrLn l

	handleFile inName outName f = do
	contents <- readFile inName
	let processed = take ((length contents) - 1) contents
	in writeFile outName $ (dnaToString . f . stringToDna) processed