Last active
August 11, 2021 16:14
-
-
Save lennax/3172753 to your computer and use it in GitHub Desktop.
coordinate mapper by Reece Hart: http://lists.open-bio.org/pipermail/biopython/2010-June/006598.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CoordinateMapper -- map between genomic, cds, and protein coordinates | |
# AUTHOR: Reece Hart <[email protected]> | |
# Modifications: Lenna X. Peterson <[email protected]> | |
# LICENSE: BioPython | |
# Examples: | |
# AB026906.1:g.7872G>T | |
# AB026906.1:c.274G>T | |
# BA...:p.Asp92Tyr | |
# | |
# All refer to congruent variants. A coordinate mapper is needed to | |
# translate between at least these three coordinate frames. The mapper | |
# should deal with specialized syntax for splicing and UTR (e.g., 88+1, | |
# 89-2, -14, *46). In addition, care should be taken to ensure consistent 0- | |
# or 1-based numbering (0 internally, as with Python/BioPython and | |
# Perl/BioPerl). | |
# | |
# g -----------00000000000-----1111111----------22222222222*222----- | |
# s0 e0 s1 e1 s2 e2 | |
# \ \ | | / / | |
# +--+ +--+ | | +-------+ +-------+ | |
# \ \| |/ / | |
# c 00000000000111111122222222222*222 | |
# c0 c1 c2 | |
# aaabbbcccdddeeefffggghhhiiijj*kkk | |
# p A B C D E F G H I J K | |
# p0 p1 p2 ... pn | |
# | |
# | |
# TODO: | |
# * g2c returns index + extended syntax | |
# * c2g accepts index + extended syntax | |
from functools import wraps | |
from math import floor | |
import warnings | |
import MapPositions # required for pos_factory decorator | |
from MapPositions import CDSPositionError, ProteinPositionError | |
from MapPositions import GenomePosition, CDSPosition, ProteinPosition | |
from Bio import BiopythonParserWarning | |
# FIXME change to absolute import once CompoundLocation is merged to master | |
from SeqFeature import FeatureLocation | |
class CoordinateMapper(object): | |
"""Convert positions between genomic, CDS, and protein coordinates.""" | |
def __init__(self, selist): | |
"""Set exons to be used for mapping. | |
Parameters | |
---------- | |
selist : SeqRecord, SeqFeature, list | |
Object containing exon information | |
""" | |
self._exons = self._get_exons(selist) | |
@staticmethod | |
def _get_exons(seq): | |
"""Extract exons from SeqRecord, SeqFeature, or list of pairs. | |
Parameters | |
---------- | |
seq : SeqRecord, SeqFeature, list | |
Object containing exon information. | |
Returns | |
------- | |
SeqFeature.FeatureLocation | |
""" | |
# Try as SeqRecord | |
if hasattr(seq, 'features'): | |
# generator | |
cdsf = (f for f in seq.features if f.type == 'CDS').next() | |
return cdsf.location | |
# Try as SeqFeature | |
elif hasattr(seq, 'location'): | |
if seq.type != 'CDS': | |
# FIXME should this be a fatal error? | |
warnings.warn("Provided SeqFeature should be CDS", | |
BiopythonParserWarning) | |
return seq.location | |
# Try as list of pairs | |
return sum([FeatureLocation(s, e) for s, e in seq]) | |
@property # read-only | |
def exons(self): | |
return self._exons | |
@property # read-only | |
def exon_list(self): | |
return list(self.exons) | |
def pos_factory(pos_type): | |
""" | |
Convert string or int pos to appropriate Position object | |
Parameters | |
---------- | |
pos_type : str | |
Position type (Genome, CDS, Protein) | |
""" | |
def wrapper(fn): | |
@wraps(fn) | |
def make_pos(self, pos, dialect=None): | |
# retrieve Position object | |
_obj = getattr(MapPositions, pos_type + "Position") | |
# if pos is not already Position object, make it one | |
if not isinstance(pos, _obj): | |
# no dialect: use default constructor | |
if dialect is None: | |
pos = _obj(pos) | |
# use dialect alternate constructor | |
else: | |
pos = _obj.from_dialect(dialect, pos) | |
# call function with new pos | |
return fn(self, pos, dialect) | |
return make_pos | |
return wrapper | |
@pos_factory("Genome") | |
def g2c(self, gpos, dialect=None): | |
"""Convert integer from genomic to CDS coordinates | |
Parameters | |
---------- | |
gpos : int | |
Genomic position | |
dialect : str | |
Coordinate dialect (GenBank or HGVS, default None) | |
Returns | |
------- | |
CDSPosition | |
""" | |
gpos = int(gpos) | |
fmts = CDSPosition.fmt_dict | |
def _simple_g2c(g): | |
return CDSPosition(self.exon_list.index(g)) | |
# within exon | |
if gpos in self.exons: | |
return _simple_g2c(gpos) | |
# before CDS | |
if gpos < self.exons.start: | |
return CDSPosition(fmts['post-CDS'] % (gpos - self.exons.start)) | |
# after CDS | |
if gpos >= self.exons.end: | |
return CDSPosition(fmts['pre-CDS'] % (gpos - self.exons.end + 1)) | |
# intron | |
# set start of first intron | |
prev_end = self.exons.parts[0].end | |
for part in self.exons.parts[1:]: | |
# not in this intron | |
if gpos > part.start: | |
prev_end = part.end | |
continue | |
len_intron = part.start - prev_end | |
# second half (exclusive) of intron | |
# offset > middle of intron | |
if gpos - prev_end > floor(len_intron / 2.0): | |
anchor = _simple_g2c(part.start) | |
offset = gpos - part.start | |
assert offset < 0 | |
# first half (inclusive) of intron | |
else: | |
anchor = _simple_g2c(prev_end - 1) | |
offset = gpos - prev_end + 1 | |
assert offset > 0 | |
assert self.check_intron(anchor, offset) | |
return CDSPosition(fmts['intron'] % (anchor, offset)) | |
assert False # function should return for every integer | |
# TODO verify that values of offset are sane | |
def check_intron(self, anchor, offset): | |
""" | |
Verify that CDS-relative intron position is valid with given exons. | |
Parameters | |
---------- | |
anchor : int | |
Intron anchor (closest CDS position) | |
offset : int | |
Intron offset (distance to anchor) | |
Returns | |
------- | |
bool | |
""" | |
for exon in self.exons.parts: | |
start = int(self.g2c(exon.start)) | |
if anchor == start: | |
if offset > 0: | |
raise CDSPositionError( | |
"Invalid intron: offset from exon start must be negative.") | |
return True | |
end = int(self.g2c(exon.end - 1)) | |
if anchor == end: | |
if offset < 0: | |
raise CDSPositionError( | |
"Invalid intron: offset from exon end must be positive.") | |
return True | |
raise CDSPositionError( | |
"Invalid intron: anchor must be start or end of an exon.") | |
def get_strand(self, gpos): | |
for exon in self.exons.parts: | |
if gpos in exon: | |
return exon.strand | |
raise ValueError("Provided gpos must be exon") | |
@pos_factory("CDS") | |
def c2g(self, cpos, dialect=None): | |
"""Convert from CDS to genomic coordinates | |
Parameters | |
---------- | |
cpos : int | |
CDS position | |
dialect : str | |
Coordinate dialect (GenBank or HGVS, default None) | |
Returns | |
------- | |
GenomePosition | |
""" | |
if cpos.pos_type == "pre-CDS": | |
return GenomePosition(self.exons.start + cpos.offset) | |
elif cpos.pos_type == "post-CDS": | |
return GenomePosition(self.exons.end - 1 + cpos.offset) | |
g_anchor = self.exon_list[cpos.anchor] | |
if cpos.pos_type == "exon": | |
strand = self.get_strand(g_anchor) | |
return GenomePosition(g_anchor, strand=strand) | |
elif cpos.pos_type == "intron": | |
offset = cpos.offset | |
if self.check_intron(cpos.anchor, offset): | |
return GenomePosition(g_anchor + offset) | |
assert False # all positions should be one of the 4 types | |
@pos_factory("CDS") | |
def c2p(self, cpos, dialect=None): | |
"""Convert from CDS to protein coordinates | |
Parameters | |
---------- | |
cpos : int | |
CDS position | |
dialect : str | |
Coordinate dialect (GenBank or HGVS, default None) | |
Returns | |
------- | |
ProteinPosition | |
""" | |
try: | |
cpos = int(cpos) | |
except TypeError: | |
raise CDSPositionError("'%s' does not correspond to a protein" | |
% repr(cpos)) | |
return ProteinPosition(int(cpos / 3.0)) | |
@pos_factory("Genome") | |
def g2p(self, gpos, dialect=None): | |
"""Convert integer from genomic to protein coordinates | |
Parameters | |
---------- | |
gpos : int | |
Genomic position | |
dialect : str | |
Coordinate dialect (GenBank or HGVS, default None) | |
Returns | |
------- | |
ProteinPosition | |
""" | |
return self.c2p(self.g2c(gpos)) | |
@pos_factory("Protein") | |
def p2c(self, ppos, dialect=None): | |
"""Convert integer from protein coordinate to CDS closed range | |
Parameters | |
---------- | |
ppos : int | |
Protein position | |
dialect : str | |
Coordinate dialect (GenBank or HGVS, default None) | |
Returns | |
------- | |
CDSPosition | |
""" | |
try: | |
ppos = int(ppos) | |
except TypeError: | |
return None | |
if ppos < 0: | |
raise ProteinPositionError("'%s' should not be negative") | |
# FIXME is CDS guaranteed to have len % 3 == 0? | |
first_base = (ppos) * 3 | |
last_base = first_base + 2 | |
if last_base > len(self.exons): | |
raise ProteinPositionError("'%s' is too large") | |
return (CDSPosition(first_base), CDSPosition(last_base)) | |
@pos_factory("Protein") | |
def p2g(self, ppos, dialect=None): | |
"""Convert integer from protein to genomic coordinates | |
Parameters | |
---------- | |
ppos : int | |
Protein position | |
dialect : str | |
Coordinate dialect (GenBank or HGVS, default None) | |
Returns | |
------- | |
GenomePosition | |
""" | |
return tuple(self.c2g(x) for x in self.p2c(ppos)) | |
if __name__ == '__main__': | |
# The following exons are from AB026906.1. | |
# test case: g.7872 -> c.274 -> p.92 | |
# N.B. These are python counting coordinates (0-based) | |
exons = [(5808, 5860), (6757, 6874), (7767, 7912), (13709, 13785)] | |
def test_list(g_range): | |
cm = CoordinateMapper(exons) | |
for g1 in g_range: | |
print g1, | |
c1 = cm.g2c(g1) | |
print c1, | |
p1 = cm.c2p(c1) | |
print p1, | |
if p1: | |
c2 = cm.p2c(p1)[0] | |
else: | |
c2 = c1 | |
print ' | ', c2, | |
g2 = cm.c2g(c2) | |
print g2 | |
print cm.g2p(7872) | |
print cm.p2g(92) | |
def test_simple(): | |
from SeqFeature import SeqFeature | |
location = sum([FeatureLocation(2, 4, +1), | |
FeatureLocation(8, 11, +1), | |
FeatureLocation(16, 18, +1)]) | |
simple_exons = SeqFeature(location, type="CDS") | |
cm = CoordinateMapper(simple_exons) | |
print cm.exons | |
print list(cm.exons) | |
print range(len(cm.exons)) | |
for i in range(len(cm.exons)): | |
print "%3s" % cm.c2g(i), | |
for i in xrange(20): | |
print "%3d" % i, | |
for i in xrange(20): | |
print "%3s" % cm.g2c(i), | |
r1 = (7870, 7871, 7872, 7873, 7874) | |
r2 = (5807, 5808, 5809, | |
6871, 6872, 6873, 6874, 6875, | |
7766, 7767, 7768, 7769, | |
13784, 13785, 13786) | |
test_list(r1) | |
#test_list(r2) | |
test_simple() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from copy import copy | |
import re | |
class InvalidPositionError(ValueError): | |
"Exception for bad coordinates" | |
class GenomePositionError(InvalidPositionError): | |
"Exception for bad genome coordinates" | |
class CDSPositionError(InvalidPositionError): | |
"Exception for bad CDS coordinates" | |
class ProteinPositionError(InvalidPositionError): | |
"Exception for bad protein coordinates" | |
class MapPosition(object): | |
"""Generic position for coordinate mapping.""" | |
def __init__(self, pos, index=None, **kwargs): | |
"""Init class from HGVS position. | |
Parameters | |
---------- | |
pos : int, str | |
Position to convert | |
index : int | |
Start of counting (e.g. 1 for GenBank or HGVS) | |
""" | |
self.index = index | |
if pos and self.index: | |
pos -= index | |
self.pos = pos | |
@classmethod | |
def from_dialect(cls, dialect, pos, strand=None): | |
"""Init class from given dialect. | |
Parameters | |
---------- | |
dialect : str | |
input dialect (HGVS or GenBank) | |
pos : int, str | |
dialect position to convert | |
strand : int | |
Strand of position (-1 or +1, default None) | |
Returns | |
------- | |
cls | |
""" | |
if dialect is None: | |
return cls(pos, strand=strand) | |
try: | |
from_func = getattr(cls, "from_" + dialect.lower()) | |
except AttributeError: | |
raise ValueError("Dialect '%s' not valid" % dialect) | |
return from_func(pos, strand) | |
@classmethod | |
def from_hgvs(cls, hgvs_pos, strand=None): | |
"""Init class from HGVS position. | |
Parameters | |
---------- | |
hgvs_pos : int, str | |
HGVS position to convert | |
strand : int | |
Strand of position (-1 or +1, default None) | |
Returns | |
------- | |
cls | |
""" | |
return cls(hgvs_pos, index=1, strand=strand, post_fmt="*%d") | |
@classmethod | |
def from_genbank(cls, gbk_pos, strand=None): | |
"""Init class from GenBank position. | |
Parameters | |
---------- | |
gbk_pos : int, str | |
GenBank position to convert | |
strand : int | |
Strand of position (-1 or +1, default None) | |
Returns | |
------- | |
cls | |
""" | |
return cls(gbk_pos, index=1, strand=strand) | |
def to(self, dialect): | |
"""Convert position to specified dialect. | |
Parameters | |
---------- | |
dialect : str | |
Output dialect (HGVS or GenBank) | |
""" | |
if dialect is None: | |
return self.to_str() | |
try: | |
to_func = getattr(self, "to_" + dialect.lower()) | |
except AttributeError: | |
raise ValueError("Dialect '%s' not valid" % dialect) | |
return to_func() | |
def to_hgvs(self): | |
"""Convert position to HGVS""" | |
if self.pos or self.pos == 0: | |
return self.pos + 1 | |
return None | |
def to_genbank(self): | |
"""Convert position to GenBank""" | |
if self.pos or self.pos == 0: | |
return self.pos + 1 | |
return None | |
def to_str(self): | |
"""Make string representation without conversion""" | |
return self.pos | |
def __str__(self): | |
"""String representation""" | |
return str(self.to_str()) | |
def __int__(self): | |
"""Integer representation""" | |
return self.pos | |
def __repr__(self): | |
"""Detailed representation for debugging""" | |
return "%s(%s)" % (self.__class__.__name__, self.to_str()) | |
class GenomePosition(MapPosition): | |
"""Genome position for coordinate mapping""" | |
def __init__(self, gpos, index=None, strand=None, **kwargs): | |
# FIXME if index is string, error may be raised | |
if gpos < (index or 0): | |
raise GenomePositionError("Genome position cannot be negative.") | |
# call superclass constructor | |
MapPosition.__init__(self, gpos, index) | |
self.strand = strand | |
def __eq__(self, other): | |
"""Compare equal to other GenomePosition with same pos | |
or integer equal to pos""" | |
if isinstance(other, int): | |
return self.pos == other | |
return isinstance(other, GenomePosition) and self.pos == other.pos | |
class ProteinPosition(MapPosition): | |
"""Protein position for coordinate mapping""" | |
def __init__(self, ppos, index=None, **kwargs): | |
# call superclass constructor | |
MapPosition.__init__(self, ppos, index) | |
def __eq__(self, other): | |
"""Compare equal to other ProteinPosition with same pos | |
or integer equal to pos""" | |
if isinstance(other, int): | |
return self.pos == other | |
return isinstance(other, ProteinPosition) and self.pos == other.pos | |
class CDSPosition(MapPosition): | |
"""CDS position for coordinate mapping""" | |
def __init__(self, cpos, index=None, | |
pre_fmt=None, post_fmt=None, **kwargs): | |
# Dispatch types and return anchor, offset | |
if isinstance(cpos, int): | |
anchor, offset = self.parse_int(cpos) | |
elif isinstance(cpos, str): | |
anchor, offset = self.parse_str(cpos, pre_fmt, post_fmt) | |
else: | |
raise CDSPositionError("'%s' is of unknown type" % cpos) | |
# Set instance anchor and offset | |
# call superclass constructor | |
MapPosition.__init__(self, anchor, index) | |
self._offset = offset | |
self.validate() | |
@classmethod | |
def from_anchor(cls, anchor=None, offset=None): | |
"""Init CDSPosition with anchor, offset pair. | |
Parameters | |
---------- | |
anchor : int | |
CDS anchor (coordinate of nearest exon position) | |
offset : int | |
Offset from nearest exon position | |
Returns | |
------- | |
CDSPosition | |
""" | |
if not anchor: | |
pos = cls("%+d" % offset) | |
elif anchor < 0: | |
raise CDSPositionError("Anchor cannot be negative.") | |
else: | |
pos = cls(anchor) | |
pos.offset = offset | |
return pos | |
@property | |
def offset(self): | |
return self._offset | |
@offset.setter | |
def offset(self, val): | |
"Validate new offset, then update" | |
self.validate(offset=val) | |
self._offset = val | |
@property | |
def anchor(self): | |
return self.pos | |
@anchor.setter | |
def anchor(self, val): | |
"Validate new anchor, then update pos" | |
self.validate(anchor=val) | |
self.pos = val | |
def validate(self, anchor="sentinel", offset="sentinel"): | |
"""Check whether anchor and offset yield a valid position. | |
Parameters | |
---------- | |
anchor : int | |
CDS anchor (coordinate of nearest exon position) | |
offset : int | |
Offset from nearest exon position | |
Returns | |
------- | |
bool | |
""" | |
if anchor == "sentinel": | |
anchor = self.anchor | |
if offset == "sentinel": | |
offset = self.offset | |
if offset == 0: | |
raise CDSPositionError( | |
"Offset may not be 0. For no offset, use None.") | |
if not anchor and anchor != 0 and not offset: | |
raise CDSPositionError( | |
"At least one of pos or offset must be defined") | |
if anchor and anchor < 0: | |
raise CDSPositionError("CDS anchor may not be negative.") | |
return True | |
@property | |
def pos_type(self): | |
"Type of CDS position, dynamically determined from values" | |
# inside CDS | |
if self.pos or self.pos == 0: | |
if not self.offset: | |
return "exon" | |
return "intron" | |
# outside CDS | |
elif self.offset > 0: | |
return "post-CDS" | |
return "pre-CDS" | |
assert False # all integers should return | |
@property | |
def sub_dict(self): | |
if self.pos_type == 'intron': | |
return {'pos': self.pos, 'offset': self.offset} | |
if self.pos_type == 'exon': | |
return {'pos': self.pos} | |
if self.pos_type == 'post-CDS' or self.pos_type == 'pre-CDS': | |
return {'offset': self.offset} | |
fmt_dict = { | |
'exon': "%d", | |
'intron': "%d%+d", | |
'post-CDS': "%+d", | |
'pre-CDS': "%+d", | |
} | |
@staticmethod | |
def _shift_index(pos_dict, idx): | |
"""Increment value of dict key 'pos' by given index. | |
Parameters | |
---------- | |
pos_dict : dict | |
Dictionary to search for 'pos' | |
idx : int | |
Index to add to pos_dict['pos'] | |
Returns | |
------- | |
dict | |
""" | |
if 'pos' in pos_dict and idx: | |
pos_dict['pos'] += idx | |
return pos_dict | |
def _make_str(self, val_dict=None, fmt_dict=None): | |
"""Retrieve format string and substitute values. | |
Parameters | |
---------- | |
val_dict : dict | |
Dictionary of values to substitute into string | |
fmt_dict : dict | |
Dictionary of format strings for each pos type | |
Returns | |
------- | |
str | |
""" | |
# set default dicts if parameter dicts are false | |
fmt_dict = fmt_dict or self.fmt_dict | |
val_dict = val_dict or self.sub_dict | |
return fmt_dict[self.pos_type] % tuple(val_dict.values()) | |
@staticmethod | |
def parse_int(cpos): | |
"""Parse int to anchor, offset pair | |
Parameters | |
---------- | |
cpos : int | |
Integer position to convert | |
Returns | |
------- | |
tuple | |
""" | |
# treat negative int as offset | |
if cpos < 0: | |
return (None, cpos) | |
# treat positive int as anchor | |
return (cpos, None) | |
@staticmethod | |
def parse_str(cpos, pre_fmt, post_fmt): | |
"""Parse string to anchor, offset pair | |
Parameters | |
---------- | |
cpos : str | |
String position to convert | |
pre_fmt : str | |
Format string for pre-CDS positions | |
post_fmt : str | |
Format string for post-CDS positions | |
Returns | |
------- | |
tuple | |
""" | |
delimiters = "\+\-" | |
if post_fmt and "*" in post_fmt: | |
delimiters += "\*" | |
# parenth causes split pattern to be kept | |
delim_rx = re.compile("([%s])" % delimiters) | |
parsed = delim_rx.split(cpos, 1) | |
if len(parsed) == 1: # may be int | |
return CDSPosition.parse_int(int(cpos)) | |
# 1 split is normally length 2 but delimiter is also kept | |
elif len(parsed) != 3: | |
raise CDSPositionError( | |
"String '%s' not parseable for this position." % cpos) | |
if parsed[0] == "": | |
anchor = None | |
else: | |
anchor = int(parsed[0]) | |
if post_fmt and parsed[1] == "*" == post_fmt[0]: | |
parsed[1] = "+" | |
offset = int("".join(parsed[1:])) | |
return (anchor, offset) | |
def to_hgvs(self): | |
"""Convert CDS position to HGVS""" | |
fmt_dict = copy(self.fmt_dict) | |
fmt_dict['post-CDS'] = "*%d" | |
sub_dict = self._shift_index(self.sub_dict, 1) | |
return self._make_str(sub_dict, fmt_dict) | |
def to_genbank(self): | |
"""Convert CDS position to GenBank""" | |
sub_dict = self._shift_index(self.sub_dict, 1) | |
return self._make_str(sub_dict) | |
def to_str(self): | |
"""Make string representation of CDS position""" | |
return self._make_str() | |
def __int__(self): | |
"""Integer representation of CDS exon, otherwise NotImplemented""" | |
if self.pos_type == "exon": | |
return MapPosition.__int__(self) | |
return NotImplemented | |
def __eq__(self, other): | |
"""Compare equal to other MapPosition with same pos and offset | |
or int if exon""" | |
if isinstance(other, int) and self.pos_type == "exon": | |
return self.pos == other | |
return isinstance(other, CDSPosition) and \ | |
self.pos == other.pos and \ | |
self.offset == other.offset | |
if __name__ == "__main__": | |
def print_pos(pos_obj): | |
print "object:", pos_obj | |
print "repr:", repr(pos_obj) | |
print "HGVS:", pos_obj.to_hgvs() | |
g = GenomePosition.from_hgvs(6) | |
print_pos(g) | |
test_g = GenomePosition(5) | |
#test_c = CDSPosition("6+1") | |
#test_c = CDSPosition.from_hgvs("6+1") | |
test_c = CDSPosition.from_hgvs("*1") | |
#test_c = CDSPosition(6) | |
#test_c = CDSPosition(-1) | |
print_pos(test_g) | |
print test_c.pos_type | |
print_pos(test_c) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from functools import wraps | |
import unittest | |
from CoordinateMapper import CoordinateMapper | |
from MapPositions import GenomePositionError | |
from MapPositions import ProteinPositionError | |
from MapPositions import CDSPosition, CDSPositionError | |
from SeqFeature import FeatureLocation, SeqFeature | |
from Bio.SeqRecord import SeqRecord | |
exons = [(5808, 5860), (6757, 6874), (7767, 7912), (13709, 13785)] | |
cmap = CoordinateMapper(exons) | |
g_exons = xrange(7868, 7875) # len 7 | |
c_exons = xrange(270, 279) # len 9 | |
p_exons = xrange(90, 93) # len 3 | |
p_exons_trip = [p for p in p_exons for n in range(3)] # len 9 | |
c_exon_prs = ((270, 272), (273, 275), (276, 278)) # len 3 | |
g_exon_prs = ((7868, 7870), (7871, 7873), (7874, 7876)) # len 3 | |
g_introns = {None: (5860, 5861, 6308, 6309, 6755, 6756), | |
'HGVS': (5861, 5862, 6309, 6310, 6756, 6757)} | |
c_introns = {None: ('51+1', '51+2', '51+449', '52-448', '52-2', '52-1'), | |
'HGVS': ('52+1', '52+2', '52+449', '53-448', '53-2', '53-1')} | |
c_intron_tups = ((51, 1), (51, 2), (51, 449), (52, -448), (52, -2), (52, -1)) | |
g_outside = {None: (0, 13785, 14000), 'HGVS': (1, 13786, 14001)} | |
c_outside = {None: ('-5808', '+1', '+216'), 'HGVS': ('-5808', '*1', '*216')} | |
c_outside_tups = ((None, -5808), (None, 1), (None, 216)) | |
def two_dialects(fn): | |
@wraps(fn) | |
def call_tests(self): | |
orig_dialect = self.dialect | |
for dialect in (None, 'HGVS'): | |
self.dialect = dialect | |
fn(self) | |
self.dialect = orig_dialect | |
return call_tests | |
class TestCDSPosition(unittest.TestCase): | |
"""Test that CDSPosition works properly""" | |
def setUp(self): | |
self.dialect = None | |
@two_dialects | |
def testGoodIntron(self): | |
"""CDSPosition should match good intron values""" | |
for c_args, c in zip(c_intron_tups, c_introns[self.dialect]): | |
actual = CDSPosition.from_anchor(*c_args).to(self.dialect) | |
self.assertEqual(actual, c) | |
@two_dialects | |
def testGoodOutside(self): | |
"""CDSPosition should match good outside-CDS values""" | |
for c_args, c in zip(c_outside_tups, c_outside[self.dialect]): | |
actual = CDSPosition.from_anchor(*c_args).to(self.dialect) | |
self.assertEqual(actual, c) | |
def testEqual(self): | |
"""CDSPosition should test equal with same args""" | |
for args in c_intron_tups: | |
CPos = CDSPosition.from_anchor | |
self.assertEqual(CPos(*args), CPos(*args)) | |
self.assertEqual(str(CPos(*args)), str(CPos(*args))) | |
#def testEqualDialects(self): | |
#for c_pos in c_outside: | |
#self.assertEqual(CDSPosition(c_pos), CDSPosition.from_hgvs(c_pos)) | |
#self.assertEqual(c_pos, CDSPosition.from_hgvs(c_pos).to_hgvs()) | |
def testBadAnchor(self): | |
"""CDSPosition should fail with negative CDS anchor""" | |
self.assertRaises(CDSPositionError, CDSPosition.from_anchor, -1, 1) | |
def testBadOffset(self): | |
"""CDSPosition should fail with zero offset""" | |
self.assertRaises(CDSPositionError, CDSPosition.from_anchor, None, 0) | |
def testValidate(self): | |
"""Setting CDSPosition anchor offset should fail for invalid cases""" | |
intron = CDSPosition("22+4") | |
self.assertRaises(CDSPositionError, setattr, intron, "offset", 0) | |
self.assertRaises(CDSPositionError, setattr, intron, "anchor", -5) | |
exon = CDSPosition("60") | |
self.assertRaises(CDSPositionError, setattr, exon, "anchor", None) | |
class TestCoordinateMapper(unittest.TestCase): | |
"""Test that CoordinateMapper works properly""" | |
def setUp(self): | |
self.sf = SeqFeature(sum((FeatureLocation(s, e) for s, e in exons)), | |
type="CDS") | |
def testListInit(self): | |
"""CoordinateMapper should take list of exon pairs""" | |
cm = CoordinateMapper(exons) | |
# FIXME CompoundLocation does not have __eq__ | |
self.assertEqual(str(cm.exons), str(self.sf.location)) | |
def testSeqRecordInit(self): | |
"""CoordinateMapper should use first CDS feature of SeqRecord""" | |
sr = SeqRecord("", features=[self.sf]) | |
cm = CoordinateMapper(sr) | |
self.assertEqual(str(cm.exons), str(self.sf.location)) | |
def testSeqFeatureInit(self): | |
"""CoordinateMapper should take a CDS SeqFeature""" | |
cm = CoordinateMapper(self.sf) | |
self.assertEqual(str(cm.exons), str(self.sf.location)) | |
def testEmptyInit(self): | |
"""CoordinateMapper should fail with no arguments""" | |
self.assertRaises(Exception, CoordinateMapper) | |
class Testg2c(unittest.TestCase): | |
"""Test success of good input and failure of bad input to g2c""" | |
def setUp(self): | |
self.dialect = None | |
# what it should do | |
# any integer should return | |
def testGoodExons(self): | |
"""g2c should work for exon positions""" | |
for arg, expected in zip(g_exons, c_exons): | |
self.assertEqual(cmap.g2c(arg), expected) | |
@two_dialects | |
def testGoodIntronsStr(self): | |
"""g2c should work for intron positions (string)""" | |
dia = self.dialect | |
for arg, expect in zip(g_introns[dia], c_introns[dia]): | |
actual = cmap.g2c(arg, dia).to(dia) | |
self.assertEqual(actual, expect) | |
@two_dialects | |
def testGoodIntrons(self): | |
"""g2c should work for intron positions (CDSPosition)""" | |
for arg, tup in zip(g_introns[self.dialect], c_intron_tups): | |
actual = cmap.g2c(arg, self.dialect) | |
expect = CDSPosition.from_anchor(*tup) | |
self.assertEqual(actual, expect) | |
self.assertEqual(str(actual), str(expect)) | |
@two_dialects | |
def testGoodOutsideStr(self): | |
"""g2c should work for outside positions (string)""" | |
dia = self.dialect | |
for arg, expected in zip(g_outside[dia], c_outside[dia]): | |
actual = cmap.g2c(arg, dia).to(dia) | |
self.assertEqual(actual, expected) | |
def testGoodOutside(self): | |
"""g2c should work for outside positions (CDSPosition)""" | |
for arg, tup in zip(g_outside[self.dialect], c_outside_tups): | |
actual = cmap.g2c(arg) | |
expect = CDSPosition.from_anchor(*tup) | |
self.assertEqual(actual, expect) | |
self.assertEqual(str(actual), str(expect)) | |
# what it shouldn't do | |
# should it handle non-exact positions? | |
@two_dialects | |
def testZeroArg(self): | |
"""g2c should work for 0 in no dialect and fail for 1-index""" | |
args = (0, self.dialect) | |
if self.dialect is None: | |
cmap.g2c(*args) | |
else: | |
self.assertRaises(GenomePositionError, cmap.g2c, *args) | |
@two_dialects | |
def testBadArg(self): | |
"""g2c should fail on string, float, None, or negative""" | |
bad = ("string", None, 3.14, -5) | |
for arg in bad: | |
self.assertRaises(Exception, cmap.g2c, arg) | |
#self.assertRaises(GenomePositionError, cmap.g2c, -5) | |
class Testc2p(unittest.TestCase): | |
"""Test success of good input and failure of bad input to c2p""" | |
# what it should do | |
# integer within length of exon should return correct protein | |
def testGoodExons(self): | |
"""c2p should work for exon positions""" | |
for arg, expect in zip(c_exons, p_exons_trip): | |
self.assertEqual(cmap.c2p(arg), expect) | |
# FIXME should CDSPosition return None or raise error? | |
def testBadNotProtein(self): | |
"""c2p should fail for CDSPosition or str""" | |
bad = ("string", CDSPosition.from_anchor(7, -5)) | |
for arg in bad: | |
self.assertRaises(ValueError, cmap.c2p, arg) | |
class Testp2c(unittest.TestCase): | |
"""Test success of good input and failure of bad input to p2c""" | |
# what it should do | |
# integer within length of protein should return correct range | |
def testGoodExons(self): | |
"""p2c should work for exon positions""" | |
#for arg, expect in p2c_exons.iteritems(): | |
for arg, expect in zip(p_exons, c_exon_prs): | |
self.assertEqual(cmap.p2c(arg), expect) | |
def testBadTooLarge(self): | |
"""p2c should fail for positions longer than the max length""" | |
self.assertRaises(ProteinPositionError, cmap.p2c, 500) | |
def testBadNegative(self): | |
"""p2c should fail for negative protein coordinate""" | |
self.assertRaises(ProteinPositionError, cmap.p2c, -50) | |
class Testc2g(unittest.TestCase): | |
"""Test success of good input and failure of bad input to c2g""" | |
def setUp(self): | |
self.dialect = None | |
# what it should do | |
# integer within length of exon should return correct value | |
def testGoodExons(self): | |
"""c2g should work for exon positions""" | |
for arg, expected in zip(c_exons, g_exons): | |
self.assertEqual(cmap.c2g(arg), expected) | |
# CDSPosition object should return correct value | |
@two_dialects | |
def testGoodIntrons(self): | |
"""c2g should work for introns (CDSPosition)""" | |
dia = self.dialect | |
for c_args, g in zip(c_intron_tups, g_introns[dia]): | |
actual = cmap.c2g(CDSPosition.from_anchor(*c_args), dia) | |
self.assertEqual(actual.to(dia), g) | |
# FIXME doesn't work with HGVS | |
#@two_dialects | |
def testGoodOutside(self): | |
"""c2g should work for outside (CDSPosition)""" | |
for args, expect in zip(c_outside_tups, g_outside[self.dialect]): | |
self.assertEqual(cmap.c2g(CDSPosition.from_anchor(*args)), | |
expect) | |
# simple string should return correct value | |
# \d*[+-]\d+ | |
@two_dialects | |
def testGoodIntronsStr(self): | |
"""c2g should work for intron positions (string)""" | |
dia = self.dialect | |
# c_str from dialect to *dialect* matches *dialect* g_str | |
for c_str, g in zip(c_introns[dia], g_introns[dia]): | |
self.assertEqual(cmap.c2g(c_str, dia).to(dia), g) | |
# c_str from dialect as *default* matches *default* g_str | |
for c_str, g in zip(c_introns[dia], g_introns[None]): | |
self.assertEqual(cmap.c2g(c_str, dia), g) | |
@two_dialects | |
def testGoodOutsideStr(self): | |
"""c2g should work for outside positions (string)""" | |
dia = self.dialect | |
for expected, arg in zip(g_outside[dia], c_outside[dia]): | |
self.assertEqual(cmap.c2g(arg, dia).to(dia), expected) | |
def testNegativeInt(self): | |
"""c2g should treat negative integer as pre-CDS""" | |
# XXX negative integers are treated as pre-CDS | |
self.assertEqual(cmap.c2g(-2), 5806) | |
# what it shouldn't do | |
# bad positions in form 123+4 where 123 is not end of an exon | |
# or where sign is wrong | |
@two_dialects | |
def testBadNotEnd(self): | |
"""c2g should fail on invalid introns""" | |
bad_introns = {None: ("160+5", # 160 is exonic | |
"51-6", # 51 is end | |
"52+10", # 52 is start | |
), 'HGVS': ("160+5", "52-6", "53+10")} | |
for arg in bad_introns[self.dialect]: | |
self.assertRaises(CDSPositionError, cmap.c2g, arg, self.dialect) | |
@two_dialects | |
def testBadString(self): | |
"""c2g should fail on arbitrary string""" | |
bad_vals = {None: ["xxx", "4-5'", "*10"], 'HGVS': ["xxx", "4-5"]} | |
for arg in bad_vals[self.dialect]: | |
self.assertRaises(ValueError, cmap.c2g, arg, self.dialect) | |
# integers outside length of exon should raise IndexError | |
def testBadTooLarge(self): | |
"""c2g should fail on values beyond the exon""" | |
num = 400 | |
assert num > len(cmap.exons) | |
self.assertRaises(IndexError, cmap.c2g, num) | |
self.assertRaises(Exception, cmap.c2g, "%d%d" % (num, -12)) | |
class Testp2g(unittest.TestCase): | |
def testGoodStr(self): | |
"""p2g should return expected ranges""" | |
for p_arg, gpos in zip(p_exons, g_exon_prs): | |
self.assertEqual(cmap.p2g(p_arg), gpos) | |
class Testg2p(unittest.TestCase): | |
def testGoodStr(self): | |
"""g2p should return correct position for exons""" | |
for g_arg, ppos in zip(g_exons, p_exons_trip): | |
self.assertEqual(cmap.g2p(g_arg), ppos) | |
class TestStrand(unittest.TestCase): | |
def setUp(self): | |
locations = FeatureLocation(24, 30, -1) + FeatureLocation(35, 40, +1) | |
feature = SeqFeature(locations, type="CDS") | |
self.cm = CoordinateMapper(feature) | |
self.g_exons = [29, 28, 27, 26, 25, 24, | |
35, 36, 37, 38, 39] | |
assert list(self.cm.exons) == self.g_exons | |
c_len = 11 | |
assert c_len == len(list(self.cm.exons)) | |
self.c_exons = xrange(c_len) | |
def testg2c(self): | |
"c2g and g2c should work for mixed-strand exons" | |
for c, g in zip(self.c_exons, self.g_exons): | |
self.assertEqual(c, self.cm.g2c(g)) | |
self.assertEqual(g, self.cm.c2g(c)) | |
if c < 6: | |
self.assertEqual(self.cm.c2g(c).strand, -1) | |
else: | |
self.assertEqual(self.cm.c2g(c).strand, +1) | |
if __name__ == "__main__": | |
#unittest.main() | |
suite0 = unittest.TestLoader().loadTestsFromTestCase(TestCDSPosition) | |
suite1 = unittest.TestLoader().loadTestsFromTestCase(TestCoordinateMapper) | |
suite2 = unittest.TestLoader().loadTestsFromTestCase(Testc2g) | |
suite3 = unittest.TestLoader().loadTestsFromTestCase(Testg2c) | |
suite4 = unittest.TestLoader().loadTestsFromTestCase(Testc2p) | |
suite5 = unittest.TestLoader().loadTestsFromTestCase(Testp2c) | |
suite6 = unittest.TestLoader().loadTestsFromTestCase(Testp2g) | |
suite7 = unittest.TestLoader().loadTestsFromTestCase(Testg2p) | |
suite8 = unittest.TestLoader().loadTestsFromTestCase(TestStrand) | |
all_suites = [ | |
suite0, | |
suite1, | |
suite2, | |
suite3, | |
suite4, | |
suite5, | |
suite6, | |
suite7, | |
suite8, | |
] | |
all = unittest.TestSuite(all_suites) | |
unittest.TextTestRunner(verbosity=3).run(all) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment