Last active
August 29, 2015 13:56
-
-
Save alexpreynolds/8945214 to your computer and use it in GitHub Desktop.
Consolidating genomic elements by ID relationship
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys, copy | |
def sampleInput(): | |
return '''gain_765 chr15 9681969 9685418 chr15 9660912 9712719 loss_1136 | |
gain_766 chr15 9706682 9852347 chr15 9660912 9712719 loss_1136 | |
gain_766 chr15 9706682 9852347 chr15 9765125 9863990 loss_765 | |
gain_780 chr20 9706682 9852347 chr20 9765125 9863990 loss_769 | |
gain_760 chr15 9706682 9852347 chr15 9660912 9712719 loss_1137 | |
gain_760 chr15 9706682 9852347 chr15 9765125 9863990 loss_763''' | |
class Range: | |
"""Defines a generic genomic range""" | |
def __init__(self, chr, start, stop): | |
self.chr = str(chr) | |
self.start = int(start) | |
self.stop = int(stop) | |
def __str__(self): | |
return "[" + str(self.chr) + ":" + str(self.start) + "-" + str(self.stop) + "]" | |
class Entity: | |
"""Defines an entity""" | |
def __init__(self, name, range): | |
self.name = name | |
self.range = range | |
def __str__(self): | |
return self.name + " " + str(self.range) | |
def main(): | |
ranges = dict() | |
rels = dict() | |
for line in sampleInput().split('\n'): | |
elems = line.split('\t') | |
enA = Entity(elems[0], Range(elems[1], elems[2], elems[3])) | |
enB = Entity(elems[7], Range(elems[4], elems[5], elems[6])) | |
ranges[enA.name] = enA.range | |
ranges[enB.name] = enB.range | |
if len(rels) > 0: | |
if enA.name in rels: | |
rels[enA.name].add(enB.name) | |
if enB.name in rels: | |
rels[enB.name].add(enA.name) | |
else: | |
rels[enB.name] = set([enA.name]) | |
elif enB.name in rels: | |
rels[enB.name].add(enA.name) | |
if enA.name in rels: | |
rels[enA.name].add(enB.name) | |
else: | |
rels[enA.name] = set([enB.name]) | |
else: | |
rels[enA.name] = set([enB.name]) | |
rels[enB.name] = set([enA.name]) | |
else: | |
rels[enA.name] = set([enB.name]) | |
rels[enB.name] = set([enA.name]) | |
for k in rels: | |
s = rels[k] | |
ns = copy.deepcopy(s) | |
for e in s: | |
if e in rels: | |
os = rels[e] | |
ns = ns.union(os) | |
rels[k] = ns | |
nrels = dict() | |
for k in rels: | |
s = rels[k] | |
for e in s: | |
if e != k: | |
rels[e] = set() | |
elif len(rels[e]) > 0: | |
nrels[e] = s | |
for nk in nrels: | |
s = nrels[nk] | |
chr = "" | |
min = sys.maxint | |
max = -sys.maxint - 1 | |
for r in s: | |
chr = ranges[r].chr | |
if ranges[r].start < min: | |
min = ranges[r].start | |
if ranges[r].stop > max: | |
max = ranges[r].stop | |
print '\t'.join([chr, str(min), str(max), str(s)]) | |
if __name__ == "__main__": | |
main() |
Author
alexpreynolds
commented
Feb 11, 2014
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment