Created
August 11, 2009 03:41
-
-
Save bemasher/165610 to your computer and use it in GitHub Desktop.
Compares a list of chromosome regions from one file to a list of probes generated for NimbleGen.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Compares a list of chromosome regions from one file to a list of probes generated for NimbleGen.""" | |
from optparse import OptionParser | |
import math | |
parser = OptionParser() | |
parser.usage = "%prog file1 file2 [options]" | |
parser.add_option("-r", "--range", type="int", dest="range", metavar="INT", default="200", help="set range tolerance [default: %default]") | |
parser.add_option("-p", "--percentage", action="store_true", help="display percentage of range covered instead of difference") | |
gen_chrX = [] | |
gen_chrY = [] | |
(options, args) = parser.parse_args() | |
def find_gaps(needle, region, haystack): | |
matches = [] | |
# For every probe in region of haystack | |
for index in region: | |
# Create a list of all integers in it | |
matches[-1:] = range(haystack[index][0], haystack[index][1] + 1) | |
# Use set logic to find the difference | |
return len(set(range(needle[0], needle[1])) - set(matches)) | |
def binary_match(needle, haystack): | |
# Find mid index | |
mid = len(haystack)/2 | |
# If needle falls inside the region in the middle of the haystack, we're done, or if it's the last one | |
if (needle > haystack[0] and needle < haystack[1]) or (len(haystack) == 1): | |
return haystack[mid] | |
else: | |
# If needle is less than the average of the middle probe's values | |
if needle < (haystack[mid][0] + haystack[mid][1]) / 2: | |
# Search bottom half | |
return binary_match(needle, haystack[:mid]) | |
else: | |
# Search top half | |
return binary_match(needle, haystack[mid:]) | |
def perform_checks(needle, haystack): | |
# Look for matches of low and high bounds of needle | |
match_low, match_high = binary_match(needle[0], haystack), binary_match(needle[1], haystack) | |
# Find the indexes of the matches we just found | |
index_low, index_high = haystack.index(match_low), haystack.index(match_high) | |
# If we're not at the first index, expand difference search down one index | |
if index_low > 0: | |
index_low -= 1 | |
else: | |
index_low = 0 | |
# If we're not at the very top index, expand difference search up one index | |
if index_high < len(haystack) - 1: | |
index_high += 1 | |
else: | |
index_high = len(haystack) - 1 | |
# Find gaps the regions we've just selected don't cover in the needle | |
return find_gaps(needle, range(index_low, index_high + 1), haystack) | |
def display_result(name, needle, result): | |
# Does result pass given range? | |
if result < options.range: | |
bool = "PASS" | |
else: | |
bool = "FAIL" | |
# If percentage specified then find percentage of needle covered | |
if options.percentage: | |
needle_len = needle[1] - needle[0] | |
result = (float(needle_len - result) / float(needle[1] - needle[0])) * 100 | |
print "%s\t%s\t%s" % (name, result, bool) | |
# Print usage and exit if user doesn't give both input file arguments | |
if len(args) < 2: | |
parser.print_help() | |
raise SystemExit | |
# Attempt to open the probes generated to check against | |
try: | |
generated_regions = open(args[1], "r") | |
except IOError: | |
# We couldn't find the file, so exit | |
print "Could not find file:", args[1] | |
raise SystemExit | |
# Read through file until we get to relavent probes | |
for line in generated_regions: | |
try: | |
# We can ignore all data before NimbleGen probes | |
line.index("NimbleGen") | |
break | |
except ValueError: | |
pass | |
# Create lists of X and Y probes | |
for line in generated_regions: | |
temp = line.split("\t") | |
try: | |
# If it's a X chromosome add to gen_chrX | |
temp[0].index("chrX") | |
gen_chrX.append((int(temp[1]), int(temp[2]))) | |
except ValueError: | |
# If it's a Y chromosome add to gen_chrY | |
gen_chrY.append((int(temp[1]), int(temp[2]))) | |
# We're done with this file, close it | |
generated_regions.close() | |
# Attempt to open the file we're trying to find matches for | |
try: | |
request_regions = open(args[0], 'r') | |
except IOError: | |
# We couldn't open the file, so exit | |
print "Could not find file:", args[0] | |
raise SystemExit | |
# For each region parse and check against the X and Y probe lists | |
for line in request_regions: | |
line = line.replace("\n", "") | |
line = line.split("\t") | |
# Extract bounds of needle from line and cast to int | |
needle = (int(line[1]), int(line[2])) | |
# If the chromosome is X use X probes list, else use Y probes list | |
if line[0] == "chrX": | |
result = perform_checks(needle, gen_chrX) | |
display_result(line[3], needle, result) | |
else: | |
result = perform_checks(needle, gen_chrY) | |
display_result(line[3], needle, result) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment