Skip to content

Instantly share code, notes, and snippets.

@jimhester
Created July 31, 2012 15:29
Show Gist options
  • Save jimhester/3217832 to your computer and use it in GitHub Desktop.
Save jimhester/3217832 to your computer and use it in GitHub Desktop.
Restriction enzyme regular expression performance
my %sites;
while(<>){
next if( 1 .. /Rich Roberts/);
next unless /\S/;
my(@names) = split /[\s\)\(]+/;
my $site = IUB_to_regexp(uc($names[-1]));
if(not exists $sites{$site} and not exists $sites{$site}){
print join("\t",$names[0],$site)."\n";
$sites{$site}++;
}
}
# Translate IUB ambiguity codes to regular expressions # IUB_to_regexp
#
# A subroutine that, given a sequence with IUB ambiguity codes,
# outputs a translation with IUB codes changed to regular expressions
#
# These are the IUB ambiguity codes
# (Eur. J. Biochem. 150: 1-5, 1985):
# R = G or A
# Y = C or T
# M = A or C
# K = G or T
# S = G or C
# W = A or T
# B = not A (C or G or T)
# D = not C (A or G or T)
# H = not G (A or C or T)
# V = not T (A or C or G)
# N = A or C or G or T
sub revcomIUB{
my $seq = shift;
$seq =~ tr/ACGTRYMKSWBDHVN/TGCAYRKMWSVHDBN/;
reverse $seq;
}
sub IUB_to_regexp {
my($iub) = @_;
my $regular_expression = '';
my %iub2character_class = (
A => 'A', C => 'C', G => 'G', T => 'T', R => '[GA]', Y => '[CT]', M => '[AC]',
K => '[GT]', S => '[GC]', W => '[AT]', B => '[CGT]', D => '[AGT]', H => '[ACT]',
V => '[ACG]', N => '[ACGT]',
);
$iub =~ s/\^//g;
for ( my $i = 0 ; $i < length($iub) ; ++$i ) {
$regular_expression .= $iub2character_class{substr($iub, $i, 1)};
}
return $regular_expression;
}
#!/usr/bin/perl
use strict;use warnings;
use Time::HiRes qw(gettimeofday tv_interval);
my $enzymeFile = shift;
my @enzymes;
open my $enzymeIn, $enzymeFile or die "$!:Could not open $enzymeFile\n";
while(<$enzymeIn>){
my($name, $enzyme) = split;
push @enzymes, [$name,$enzyme];
}
local $/ = ">";
my $first = <>;
while(my $record = <>){
chomp $record;
my $newline_loc = index($record,"\n");
my $header = substr($record,0,$newline_loc);
my $sequence = substr($record,$newline_loc+1);
$sequence = uc $sequence;
$sequence =~ tr/\n//d;
my $time = [ gettimeofday ];
for my $enzyme(@enzymes){
my $count = 0;
my $search = $enzyme->[1];
while($sequence =~ /$search/g){
$count++;
}
}
printf "perl: %.4f secs per enzyme\n" ,tv_interval($time)/@enzymes;
exit;
}
#!/usr/bin/env python
import os,sys,re
import time
def searchFasta(header,sequence):
start = time.time()
for enzyme in enzymes:
count = 0
count = len(enzyme[1].findall(sequence))
end = time.time()
return (end - start) / len(enzymes)
files = sys.argv
enzymeFile = files.pop(1)
enzymeIn = open(enzymeFile,'rU')
enzymes=[]
for line in enzymeIn:
name,enzyme = line.split()
enzymes.append((name,re.compile(enzyme)))
f = open(sys.argv[1],'rU')
header = f.readline()
header = header.rstrip(os.linesep)
sequence=''
for line in f:
line = line.rstrip('\n')
if(line[0] == '>'):
header = header[1:]
sequence = sequence.upper()
searchTime = searchFasta(header,sequence)
print "python: %.4f secs per enzyme" % (searchTime)
sys.exit(0)
header = line
sequence = ''
else:
sequence += line
#!/usr/bin/env ruby
enzymeFile = File.open(ARGV.shift)
enzymes = Array.new
enzymeFile.each do |line|
name, enzyme = line.split
enzymes << [ name, enzyme ]
end
$/ = ">"
ARGF.gets
while rec = ARGF.gets
rec.chomp!
nl = rec.index("\n")
header = rec[0..nl-1]
seq = rec[nl+1..-1]
seq.gsub!(/\n/,'')
seq.upcase!
start = Time.new()
for enzyme in enzymes
count = 0
seq.scan(/#{enzyme[1]}/){
count+=1
}
end
end_time = Time.new()
printf "ruby:\t%.4f secs per enzyme\n",((end_time - start)/enzymes.length())
exit
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment