Created
April 22, 2014 11:18
-
-
Save blahah/11174763 to your computer and use it in GitHub Desktop.
With warnings off
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'bio' | |
require 'inline' | |
require 'benchmark' | |
require 'trollop' | |
class TestC | |
def initialize(left, right) | |
@left = left | |
@right = right | |
end | |
inline :C do |builder| | |
builder.add_compile_flags %q(-w) | |
builder.include '<stdio.h>' | |
builder.include '<strings.h>' | |
builder.c <<SRC | |
void subsample(VALUE n, VALUE seed, VALUE left, VALUE right, VALUE leftout, VALUE rightout) { | |
char * filename_left; | |
char * filename_right; | |
char * outname_left; | |
char * outname_right; | |
char * line_l1 = NULL; | |
char * line_l2 = NULL; | |
char * line_l3 = NULL; | |
char * line_l4 = NULL; | |
char * line_r1 = NULL; | |
char * line_r2 = NULL; | |
char * line_r3 = NULL; | |
char * line_r4 = NULL; | |
size_t len = 0; | |
ssize_t read_l; | |
ssize_t read_r; | |
unsigned nc,i,seedc; | |
float a,r; | |
unsigned long count; | |
FILE *lfp; | |
FILE *rfp; | |
char *res_l = NULL; | |
char *res_r = NULL; | |
nc = NUM2INT(n); | |
res_l = (char*) realloc(res_l, nc * sizeof(char)); | |
res_r = (char*) realloc(res_r, nc * sizeof(char)); | |
seedc = NUM2INT(seed); | |
srand(seedc); | |
filename_left = StringValueCStr(left); | |
filename_right = StringValueCStr(right); | |
outname_left = StringValueCStr(leftout); | |
outname_right = StringValueCStr(rightout); | |
lfp = fopen(filename_left, "r"); | |
rfp = fopen(filename_right, "r"); | |
if (lfp == NULL) { | |
fprintf(stderr, "Cant open file!\\n"); | |
exit(1); | |
} | |
if (rfp == NULL) { | |
fprintf(stderr, "Cant open file!\\n"); | |
exit(1); | |
} | |
count=1; | |
while ((read_l = getline(&line_l1, &len, lfp)) != -1) { | |
read_l += getline(&line_l2, &len, lfp); | |
read_l += getline(&line_l3, &len, lfp); | |
read_l += getline(&line_l4, &len, lfp); | |
read_r = getline(&line_r1, &len, rfp); | |
read_r += getline(&line_r2, &len, rfp); | |
read_r += getline(&line_r3, &len, rfp); | |
read_r += getline(&line_r4, &len, rfp); | |
char * str_l = (char *) malloc(400); | |
char * str_r = (char *) malloc(400); | |
strcpy (str_l, line_l1); | |
strcat (str_l, line_l2); | |
strcat (str_l, line_l3); | |
strcat (str_l, line_l4); | |
strcpy (str_r, line_r1); | |
strcat (str_r, line_r2); | |
strcat (str_r, line_r3); | |
strcat (str_r, line_r4); | |
a = 1.0; | |
if (count <= nc) { | |
res_l[count-1] = str_l; | |
res_r[count-1] = str_r; | |
} else { | |
r = ((float)rand()/(float)(RAND_MAX)) * a; | |
if (r < (float)nc/(float)count) { | |
i = rand() % nc; | |
res_l[i] = str_l; | |
res_r[i] = str_r; | |
} else { | |
free(str_l); | |
free(str_r); | |
} | |
} | |
++count; | |
} | |
fclose(lfp); | |
fclose(rfp); | |
FILE *lout = fopen(outname_left, "w"); | |
FILE *rout = fopen(outname_right, "w"); | |
if (lout == NULL) { | |
printf("Error opening left file for writing\\n"); | |
exit(1); | |
} | |
if (rout == NULL) { | |
printf("Error opening right file for writing\\n"); | |
exit(1); | |
} | |
for(i=0; i<nc; i++) { | |
fprintf(lout,"%s",res_l[i]); | |
fprintf(rout,"%s",res_r[i]); | |
} | |
fclose(lout); | |
fclose(rout); | |
} | |
SRC | |
end | |
def ticker(i,speed) | |
n = 10**speed | |
if i <= 1 | |
print " "*9 | |
print "0" | |
end | |
if i % n == 0 | |
print "\b"*10 | |
string = "#{i}" | |
print " "*(10-string.length) | |
print string | |
end | |
end | |
def ruby_subsample(n, seed = 1337) | |
rng = Random.new seed | |
n = n.to_f | |
count = 1 | |
l = File.open(@left,"r") | |
r = File.open(@right,"r") | |
reservoir = [] | |
line1_left = l.readline.chomp | |
line2_left = l.readline.chomp | |
line3_left = l.readline.chomp | |
line4_left = l.readline.chomp | |
line1_right = r.readline.chomp | |
line2_right = r.readline.chomp | |
line3_right = r.readline.chomp | |
line4_right = r.readline.chomp | |
while line1_left != nil and line1_right != nil | |
ticker(count,3) | |
if count <= n | |
# fill the reservoir with the first | |
# n read pair | |
reservoir << [[line1_left, line2_left, line3_left, line4_left], [line1_right, line2_right, line3_right, line4_right]] | |
else | |
# select this read with probability n / m | |
if rng.rand < n / count | |
# replace a random item in the reservoir | |
i = rng.rand(n).to_i | |
reservoir[i] = [[line1_left, line2_left, line3_left, line4_left], [line1_right, line2_right, line3_right, line4_right]] | |
end | |
end | |
count += 1 | |
line1_left = l.readline.chomp rescue nil | |
line2_left = l.readline.chomp rescue nil | |
line3_left = l.readline.chomp rescue nil | |
line4_left = l.readline.chomp rescue nil | |
line1_right = r.readline.chomp rescue nil | |
line2_right = r.readline.chomp rescue nil | |
line3_right = r.readline.chomp rescue nil | |
line4_right = r.readline.chomp rescue nil | |
end | |
# write out the reservoir reads | |
ldir = File.dirname(@left) | |
loutfile = File.join(ldir, "subset.#{File.basename @left}") | |
lout = File.open(loutfile, 'w') | |
rdir = File.dirname(@right) | |
routfile = File.join(rdir, "subset.#{File.basename @right}") | |
rout = File.open(routfile, 'w') | |
count=0 | |
reservoir.each do |pair| | |
count+=1 | |
pair[0].each do |read| | |
lout.puts read | |
end | |
pair[1].each do |read| | |
rout.puts read | |
end | |
end | |
lout.close | |
rout.close | |
[loutfile, routfile] | |
end | |
end | |
if __FILE__ == $0 | |
opts = Trollop::options do | |
version "v0.0.1a" | |
opt :left, "First input fastq file", :required => true, :type => String | |
opt :right, "Second input fastq file", :required => true, :type => String | |
opt :verbose, "Be verbose" | |
end | |
Trollop::die :left, "must exist" if !File.exist?(opts[:left]) if opts[:left] | |
Trollop::die :right, "must exist" if !File.exist?(opts[:right]) if opts[:right] | |
test = TestC.new(opts.left, opts.right) | |
# 28_312_206 reads in BS-2-1.fq | |
Benchmark.bm(1) do |x| | |
x.report { | |
# outfiles = test.ruby_subsample 100_000 | |
test.subsample(2000,1337, opts.left, opts.right, "outfile_left.fastq", "outfile_right.fastq") | |
} | |
end | |
# x.report { | |
# 1.times do | |
# test.subsample(25000, opts.left, opts.right, "outfile_left.fastq", "outfile_right.fastq") | |
# end | |
# } | |
# end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment