Skip to content

Instantly share code, notes, and snippets.

@blahah
Created April 22, 2014 11:18
Show Gist options
  • Save blahah/11174763 to your computer and use it in GitHub Desktop.
Save blahah/11174763 to your computer and use it in GitHub Desktop.
With warnings off
#!/usr/bin/env ruby
require 'bio'
require 'inline'
require 'benchmark'
require 'trollop'
class TestC
def initialize(left, right)
@left = left
@right = right
end
inline :C do |builder|
builder.add_compile_flags %q(-w)
builder.include '<stdio.h>'
builder.include '<strings.h>'
builder.c <<SRC
void subsample(VALUE n, VALUE seed, VALUE left, VALUE right, VALUE leftout, VALUE rightout) {
char * filename_left;
char * filename_right;
char * outname_left;
char * outname_right;
char * line_l1 = NULL;
char * line_l2 = NULL;
char * line_l3 = NULL;
char * line_l4 = NULL;
char * line_r1 = NULL;
char * line_r2 = NULL;
char * line_r3 = NULL;
char * line_r4 = NULL;
size_t len = 0;
ssize_t read_l;
ssize_t read_r;
unsigned nc,i,seedc;
float a,r;
unsigned long count;
FILE *lfp;
FILE *rfp;
char *res_l = NULL;
char *res_r = NULL;
nc = NUM2INT(n);
res_l = (char*) realloc(res_l, nc * sizeof(char));
res_r = (char*) realloc(res_r, nc * sizeof(char));
seedc = NUM2INT(seed);
srand(seedc);
filename_left = StringValueCStr(left);
filename_right = StringValueCStr(right);
outname_left = StringValueCStr(leftout);
outname_right = StringValueCStr(rightout);
lfp = fopen(filename_left, "r");
rfp = fopen(filename_right, "r");
if (lfp == NULL) {
fprintf(stderr, "Cant open file!\\n");
exit(1);
}
if (rfp == NULL) {
fprintf(stderr, "Cant open file!\\n");
exit(1);
}
count=1;
while ((read_l = getline(&line_l1, &len, lfp)) != -1) {
read_l += getline(&line_l2, &len, lfp);
read_l += getline(&line_l3, &len, lfp);
read_l += getline(&line_l4, &len, lfp);
read_r = getline(&line_r1, &len, rfp);
read_r += getline(&line_r2, &len, rfp);
read_r += getline(&line_r3, &len, rfp);
read_r += getline(&line_r4, &len, rfp);
char * str_l = (char *) malloc(400);
char * str_r = (char *) malloc(400);
strcpy (str_l, line_l1);
strcat (str_l, line_l2);
strcat (str_l, line_l3);
strcat (str_l, line_l4);
strcpy (str_r, line_r1);
strcat (str_r, line_r2);
strcat (str_r, line_r3);
strcat (str_r, line_r4);
a = 1.0;
if (count <= nc) {
res_l[count-1] = str_l;
res_r[count-1] = str_r;
} else {
r = ((float)rand()/(float)(RAND_MAX)) * a;
if (r < (float)nc/(float)count) {
i = rand() % nc;
res_l[i] = str_l;
res_r[i] = str_r;
} else {
free(str_l);
free(str_r);
}
}
++count;
}
fclose(lfp);
fclose(rfp);
FILE *lout = fopen(outname_left, "w");
FILE *rout = fopen(outname_right, "w");
if (lout == NULL) {
printf("Error opening left file for writing\\n");
exit(1);
}
if (rout == NULL) {
printf("Error opening right file for writing\\n");
exit(1);
}
for(i=0; i<nc; i++) {
fprintf(lout,"%s",res_l[i]);
fprintf(rout,"%s",res_r[i]);
}
fclose(lout);
fclose(rout);
}
SRC
end
def ticker(i,speed)
n = 10**speed
if i <= 1
print " "*9
print "0"
end
if i % n == 0
print "\b"*10
string = "#{i}"
print " "*(10-string.length)
print string
end
end
def ruby_subsample(n, seed = 1337)
rng = Random.new seed
n = n.to_f
count = 1
l = File.open(@left,"r")
r = File.open(@right,"r")
reservoir = []
line1_left = l.readline.chomp
line2_left = l.readline.chomp
line3_left = l.readline.chomp
line4_left = l.readline.chomp
line1_right = r.readline.chomp
line2_right = r.readline.chomp
line3_right = r.readline.chomp
line4_right = r.readline.chomp
while line1_left != nil and line1_right != nil
ticker(count,3)
if count <= n
# fill the reservoir with the first
# n read pair
reservoir << [[line1_left, line2_left, line3_left, line4_left], [line1_right, line2_right, line3_right, line4_right]]
else
# select this read with probability n / m
if rng.rand < n / count
# replace a random item in the reservoir
i = rng.rand(n).to_i
reservoir[i] = [[line1_left, line2_left, line3_left, line4_left], [line1_right, line2_right, line3_right, line4_right]]
end
end
count += 1
line1_left = l.readline.chomp rescue nil
line2_left = l.readline.chomp rescue nil
line3_left = l.readline.chomp rescue nil
line4_left = l.readline.chomp rescue nil
line1_right = r.readline.chomp rescue nil
line2_right = r.readline.chomp rescue nil
line3_right = r.readline.chomp rescue nil
line4_right = r.readline.chomp rescue nil
end
# write out the reservoir reads
ldir = File.dirname(@left)
loutfile = File.join(ldir, "subset.#{File.basename @left}")
lout = File.open(loutfile, 'w')
rdir = File.dirname(@right)
routfile = File.join(rdir, "subset.#{File.basename @right}")
rout = File.open(routfile, 'w')
count=0
reservoir.each do |pair|
count+=1
pair[0].each do |read|
lout.puts read
end
pair[1].each do |read|
rout.puts read
end
end
lout.close
rout.close
[loutfile, routfile]
end
end
if __FILE__ == $0
opts = Trollop::options do
version "v0.0.1a"
opt :left, "First input fastq file", :required => true, :type => String
opt :right, "Second input fastq file", :required => true, :type => String
opt :verbose, "Be verbose"
end
Trollop::die :left, "must exist" if !File.exist?(opts[:left]) if opts[:left]
Trollop::die :right, "must exist" if !File.exist?(opts[:right]) if opts[:right]
test = TestC.new(opts.left, opts.right)
# 28_312_206 reads in BS-2-1.fq
Benchmark.bm(1) do |x|
x.report {
# outfiles = test.ruby_subsample 100_000
test.subsample(2000,1337, opts.left, opts.right, "outfile_left.fastq", "outfile_right.fastq")
}
end
# x.report {
# 1.times do
# test.subsample(25000, opts.left, opts.right, "outfile_left.fastq", "outfile_right.fastq")
# end
# }
# end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment