Last active
March 3, 2020 20:36
-
-
Save malonge/5d61e0c45a56f406bb37fc099bf4bf84 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import gzip | |
import argparse | |
def main(): | |
message = """ | |
Summarize the rate of edits (insertion, deletion, mismatch) from minimap2 alignments of a query to a target. | |
For each alignment, the rate is defined as the number of edits divided by the alignment length | |
with respect to the query. | |
""" | |
parser = argparse.ArgumentParser(description=message) | |
parser.add_argument("paf_file", metavar="<alns.paf(.gz)>", type=str, help="PAF file (difference string (minimap2 --cs) required).") | |
parser.add_argument("--normalize", action='store_true', default=False, help="Divide the number of edits by the query alignment length + gaps.") | |
# Get the command line arguments | |
args = parser.parse_args() | |
paf_file = args.paf_file | |
normalize = args.normalize | |
re_cs = re.compile(r'([:=*+-])(\d+|[A-Za-z]+)') | |
del_sum = 0 | |
ins_sum = 0 | |
mm_sum = 0 | |
m_sum = 0 | |
denom = 0 | |
# Iterate through the PAF file | |
if paf_file[-3:] == ".gz": | |
f = gzip.open(paf_file) | |
else: | |
f = open(paf_file, 'r') | |
for line in f: | |
if not isinstance(line, str): | |
l = line.decode("utf-8").rstrip().split('\t') | |
else: | |
l = line.rstrip().split('\t') | |
cs = None | |
# Get the info we need | |
q_len = int(l[3]) - int(l[2]) | |
# Get the difference string | |
for i in l[11:]: | |
if i.startswith("cs:Z"): | |
cs = i[5:] | |
if cs is None: | |
raise ValueError("The difference stirng (--cs) is required") | |
# Parse the difference string | |
del_line_sum = 0 | |
for m in re.findall(re_cs, cs): | |
if m[0] == '*': | |
mm_sum += 1 | |
elif m[0] == "+": | |
ins_sum += len(m[1]) | |
elif m[0] == "-": | |
del_sum += len(m[1]) | |
del_line_sum += len(m[1]) | |
else: | |
m_sum += int(m[1]) | |
if normalize: | |
denom += q_len | |
denom += del_line_sum | |
else: | |
denom += q_len | |
f.close() | |
print("The insertion rate is {:.3f}%".format((ins_sum/denom)*100)) | |
print("The deletion rate is {:.3f}%".format((del_sum / denom) * 100)) | |
print("The mismatch rate is {:.3f}%".format((mm_sum / denom) * 100)) | |
print("The match rate is {:.3f}%".format((m_sum / denom) * 100)) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment