-
-
Save HerbM/946d25d0c704dcdc304a8747691cb7d2 to your computer and use it in GitHub Desktop.
open System | |
open System.IO | |
open System.Collections.Generic | |
open System.Text | |
open System.Text.RegularExpressions | |
#load "Library1.fs" | |
open Library1 | |
// Define your library scripting code here | |
/// Builds a `Regex` from the pattern text `s` using the default options.
let regex (s: string) = Regex(s)
/// Match operator: `s =~ re` is true when `re` finds a match anywhere in `s`.
let (=~) (s: string) (re: Regex) = re.IsMatch(s)
/// Non-match operator: `s <>~ re` is true when `re` finds no match in `s`.
let (<>~) (s: string) (re: Regex) = re.IsMatch(s) |> not
/// Sample text that the script matches against a regex further down.
let samplestring : string = "This is a string"
/// Lazily enumerates the lines of the text file at `filePath`.
/// The file is opened when the returned sequence is first enumerated.
// Fix: the original ignored `filePath` and always read "sample.txt".
let readLines (filePath: string) = System.IO.File.ReadLines(filePath)
// Evaluates to true: the sample text contains "This".
(=~) samplestring (regex "This")
/// True when the sequence `x` contains an item equal to `e`.
/// Example: elementOf "abc" 'c' = true
let elementOf x e =
    x |> Seq.exists (fun item -> e = item)
/// Keeps, in order, only the characters of `s` that also occur in `filter`.
/// The result is a lazy sequence of characters, not a string.
let filterString (s : string) (filter : string) =
    Seq.filter (elementOf filter) s
/// Whole contents of the sample file with the line breaks removed.
/// This is a value, not a function: the file is read once, when the binding is evaluated.
let readDNAdata =
    let lines = System.IO.File.ReadAllLines "\dev\sample.txt"
    String.concat "" lines
/// Splits one record of the form "<id ending in a digit><bases>" into at most
/// two pieces: the ID and the DNA string that follows it.
/// Returns a one-element array when the record has no digit-to-base boundary
/// (for example an empty record).
let parseDNAdata =
    // Zero-width boundary between a digit and the first A/C/T/G that follows it.
    // Built once here instead of on every call.
    let boundary = Regex("(?<=\d)(?=[ACTG])")
    fun (line : string) ->
        // Array.truncate never throws on short input, whereas slicing with
        // .[0..1] on a one-element array is compiler-version dependent.
        boundary.Split(line) |> Array.truncate 2
/// Computes the GC content of one parsed record.
/// `dnaData` must hold the record ID at index 0 and the DNA string at index 1.
/// Returns (id, percentage of characters that are 'G' or 'C', DNA string).
/// Raises ArgumentException when `dnaData` has fewer than two elements.
let GCpercent (dnaData : string []) =
    // Fail with a descriptive message instead of a bare IndexOutOfRangeException.
    if dnaData.Length < 2 then
        invalidArg "dnaData" (sprintf "expected [| id; sequence |] but got %d element(s)" dnaData.Length)
    // Named seqId so the built-in `id` function is not shadowed.
    let seqId = dnaData.[0]
    let dnaseq = dnaData.[1]
    let gcCount = filterString dnaseq "GC" |> Seq.length
    // An empty sequence has no GC content; without this guard 0.0 / 0.0 yields NaN.
    let percent =
        if dnaseq.Length = 0 then 0.0
        else 100.0 * float gcCount / float dnaseq.Length
    (seqId, percent, dnaseq)
/// Lazy sequence of (id, GC percent, DNA string), one per '>'-delimited record
/// of `readDNAdata`.
let getDNAdata =
    // When the text starts with '>', a plain Split '>' yields a leading empty
    // segment that cannot be parsed into [| id; sequence |] and makes GCpercent
    // index out of bounds. RemoveEmptyEntries drops such segments.
    readDNAdata.Split([| '>' |], System.StringSplitOptions.RemoveEmptyEntries)
    |> Seq.map parseDNAdata
    |> Seq.map GCpercent

// Evaluate the sequence so F# Interactive prints the results.
getDNAdata
can you upload the sample.txt?
I am guessing the out-of-bounds issue is happening on line 35 or 36, inside GCpercent. You can be a little safer with a pattern match:
/// Option-returning variant: Some (id, GC percent, DNA string) when `dnaData`
/// has exactly two elements, None for any other size.
let GCpercent (dnaData : string []) =
    match dnaData with
    | [| id; dnaseq |] ->
        let gcCount = filterString dnaseq "GC" |> Seq.length
        let percent = 100.0 * float gcCount / float dnaseq.Length
        Some (id, percent, dnaseq)
    | _ ->
        // Any other array size: decide what to do here; returning None is one choice.
        None
If you just want to ignore dnaData of the wrong size, then you can filter the None's with Seq.choose id
baronfel asked for the data (ignore everything from the hash line, inclusive, up):
The data doesn't come all on one line, so our main delimiter is the ">" that precedes each ID line.
The line breaks WITHIN a single element/string are in the original data.
There was no trash or spurious characters that we found.
Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
ninjarobot,
Thanks. I did some changes to that function to eliminate the out of bounds and agree it is (almost certainly) there.
Thanks. This was my first F# program so I wasn't immediately sure how to do the match...with and really appreciate your example and will review/use/adapt it.
Rosalind DNA Problem:
Read file
Clean & assemble DNA sequences (e.g., remove '>' assemble each DNA sequence with ID)
Convert text to: DNA_Sequences: ID string, DNA_String
For-Each DNA_Sequence
count C or G
count all
calculate CG_count / All_count -> CG_percent
Find max of CG_percent -> ID string, CG_percent