-
-
Save behe/b741c6ae7ff567b2d307 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule ATGCCount do | |
def count(sequence), do: cnt(String.to_char_list(sequence),0,0) | |
def cnt([65|t],at,gc), do: cnt(t,at+1,gc) | |
def cnt([84|t],at,gc), do: cnt(t,at+1,gc) | |
def cnt([71|t],at,gc), do: cnt(t,at,gc+1) | |
def cnt([67|t],at,gc), do: cnt(t,at,gc+1) | |
def cnt([62|_],at,gc), do: {at,gc} | |
def cnt([],at,gc), do: {at,gc} | |
# def cnt(_,0,0), do: {0,0} | |
def cnt([_|t], at, gc), do: cnt(t,at,gc) | |
end | |
defmodule AsyncATGCCount do | |
def count do | |
receive do | |
{sender, sequence} -> | |
send sender, { :ok, _count(sequence, 0, 0) } | |
end | |
count | |
end | |
def _count(<<?A, t :: binary>>, at, gc), do: _count(t, at+1, gc) | |
def _count(<<?T, t :: binary>>, at, gc), do: _count(t, at+1, gc) | |
def _count(<<?G, t :: binary>>, at, gc), do: _count(t, at, gc+1) | |
def _count(<<?C, t :: binary>>, at, gc), do: _count(t, at, gc+1) | |
def _count(<<?>, _ :: binary>>, at, gc), do: { at, gc } | |
def _count("", at, gc), do: { at, gc } | |
def _count(<<_, t :: binary>>, at, gc), do: _count(t, at, gc) | |
end | |
defmodule GCCount do | |
def process(filename) do | |
if File.exists?(filename) do | |
map File.stream!(filename, [:read_ahead, :raw], :line) | |
end | |
end | |
def map(stream) do | |
Enum.reduce stream, {0, 0}, fn(line, {at_acc, gc_acc}) -> | |
{at, gc} = ATGCCount.count(line) | |
{at_acc + at, gc_acc + gc} | |
end | |
end | |
def gc_ratio do | |
{at, gc} = process "chry.fa" | |
case {at, gc} do | |
{0, 0} -> 0 | |
{at, gc} -> gc/(at+gc) | |
end | |
end | |
end | |
defmodule AsyncGCCount do | |
@read_size 1_00_000 | |
@processes 10 | |
@pids Enum.reduce 1..@processes, [], fn(_, pids) -> | |
[spawn(AsyncATGCCount, :count, [])] ++ pids | |
end | |
def process(filename) do | |
if File.exists?(filename) do | |
map File.stream!(filename, [:read_ahead, :raw], @read_size) | |
end | |
end | |
def map(stream) do | |
Stream.cycle(@pids) |> | |
Stream.zip(stream) |> | |
Enum.each fn({pid, line}) -> | |
send pid, {self(), line} | |
end | |
collect {0, 0} | |
end | |
def collect {ac,gt} do | |
receive do | |
{:ok, atgc} -> | |
collect {ac + elem(atgc,0), gt + elem(atgc, 1)} | |
after | |
10 -> {ac,gt} | |
end | |
end | |
def gc_ratio do | |
{at, gc} = process "chry.fa" | |
case {at, gc} do | |
{0, 0} -> 0 | |
{at, gc} -> gc/(at+gc) | |
end | |
end | |
end | |
ExUnit.start | |
defmodule CountTest do | |
use ExUnit.Case | |
test "can count AT/GC ratio" do | |
assert GCCount.map(["ATGCAT"]) == {4, 2} | |
end | |
test "ignores N from ratio" do | |
assert GCCount.map(["AT","GC","NN","AT"]) == {4,2} | |
end | |
test "counts ATGC after N" do | |
assert GCCount.map(["NATGCAT"]) == {4,2} | |
end | |
test "ignores > lines from ratio" do | |
assert GCCount.map(["> ignore me","ATGCAT"]) == {4,2} | |
end | |
test "3M sync" do | |
{time, result} = :timer.tc(GCCount, :process, ["chry3M.fa"]) | |
IO.puts "3M sync #{time/1_000_000}" | |
assert result == {80115, 44757} | |
end | |
test "everything sync" do | |
{time, result} = :timer.tc(GCCount, :process, ["chry.fa"]) | |
IO.puts "all sync #{time/1_000_000}" | |
assert result == {5352980, 3228502} | |
end | |
end | |
defmodule AsyncCountTest do | |
use ExUnit.Case | |
test "can count AT/GC ratio" do | |
assert AsyncGCCount.map(["ATGCAT"]) == {4, 2} | |
end | |
test "ignores N from ratio" do | |
assert AsyncGCCount.map(["AT","GC","NN","AT"]) == {4,2} | |
end | |
test "counts ATGC after N" do | |
assert AsyncGCCount.map(["NATGCAT"]) == {4,2} | |
end | |
test "ignores > lines from ratio" do | |
assert AsyncGCCount.map(["> ignore me","ATGCAT"]) == {4,2} | |
end | |
test "3M async" do | |
{time, result} = :timer.tc(AsyncGCCount, :process, ["chry3M.fa"]) | |
IO.puts "3M async #{time/1_000_000}" | |
assert result == {80115, 44757} | |
end | |
test "everything async" do | |
{time, result} = :timer.tc(AsyncGCCount, :process, ["chry.fa"]) | |
IO.puts "all async #{time/1_000_000}" | |
assert result == {5352980, 3228502} | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The example file chry.fa is from ftp://ftp.ensembl.org/pub/release-67/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa.gz and chry3M.fa is a 3Mb chunk of that file created with
head -c $((1024*3000)) chry.fa > chry3M.fa