Created
August 20, 2010 07:20
-
-
Save bemasher/539807 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3, re, math, os | |
con = sqlite3.connect(':memory:') | |
db = con.cursor() | |
contigs = [425,424,3472,7154,6911,7165,6940,4912,421,422,423,7155] | |
# Set filename and open input | |
in_file = open("trimmed.ace", "r") | |
# Compile regex for parsing each line | |
AF_re = re.compile("AF\s(?P<name>.*?)\.(?P<range>\d+-\d+)(?P<link1>(?:\.fm|\.to)\d+)(?P<link2>(?:\.fm|\.to)\d+)?\s(?P<dir>[UC])") | |
# Create table | |
db.execute('''CREATE TABLE 'read_index' ('name' varchar(255) NOT NULL, 'contig' int(11) NOT NULL, 'range_low' int(11) NOT NULL, 'range_high' int(11) NOT NULL);''') | |
db.execute('''CREATE INDEX 'name' ON 'read_index' ('name')''') | |
db.execute('''CREATE INDEX 'contig' ON 'read_index' ('contig')''') | |
# Parse each line in the trimmed ace file and insert it into the database | |
for line in in_file: | |
if line.startswith("CO"): | |
# Current line is a contig, set current_contig value | |
current_contig = int(line.strip()[9:14]) | |
else: | |
# Current line is a read, parse it and insert it into the database with the current contig | |
match = AF_re.match(line.strip()) | |
read_name = match.group('name') | |
range = map(lambda x: int(x), match.group("range").split("-")) | |
db.execute("INSERT INTO read_index (name, contig, range_low, range_high) VALUES (?, ?, ?, ?);", (read_name, current_contig, range[0], range[1])) | |
# Commit changes to database | |
con.commit() | |
# SELECT a list of all reads contained in the database that have the contigs we're looking for | |
read_list = db.execute("SELECT DISTINCT * FROM read_index WHERE contig=%s" % (' OR contig='.join(map(lambda x: str(x), contigs[1:])))).fetchall() | |
# Make a dictionary of reads and their associated contigs | |
read_contigs = {} | |
for read in read_list: | |
read_contigs[str(read[0])] = map(lambda x: x[0], db.execute("SELECT contig FROM read_index WHERE name=?", (str(read[0]),)).fetchall()) | |
# Get the paths that each read traces out by sorting their ranges and contigs associated with each range | |
read_paths = {} | |
for read, contig_list in read_contigs.items(): | |
contigs = db.execute("SELECT contig FROM read_index WHERE name=? ORDER BY range_low, range_high", (read,)).fetchall() | |
path = '->'.join(map(lambda x: str(x[0]), contigs)) | |
try: | |
# Increment path counter | |
read_paths[path] += 1 | |
except: | |
# New path so set counter to 1 | |
read_paths[path] = 1 | |
# Close the database | |
db.close() | |
# Print each path and frequency | |
# for path, frequency in read_paths.items(): | |
# print path, frequency | |
for path in read_paths.keys(): | |
print path, len(path.split("->")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment