Created
July 16, 2020 16:51
-
-
Save wflynny/a4c1bde3ac60bd5f3c88976a55035d27 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Append a `gene_name` attribute to the records of a tab-separated annotation
# file whose attribute column uses `key "value";` pairs (looks like GTF —
# TODO confirm the exact dialect against real input).
#
# Usage: python <script> -i INFILE -o OUTFILE
import argparse
import re
from typing import Iterable, Optional, Sequence, TextIO

# A gene record: the text "\tgene\t" appears on the line and the attributes
# carry both `gene_id` and `Name`. The captured groups KEEP their double quotes.
gene_matcher = re.compile('\tgene\t.*gene_id (".*?");.*Name (".*?");')
# Any other record that carries a `gene_id` followed later by a `Parent`.
parent_matcher = re.compile('gene_id (".*?");.*Parent (".*?");')
# Output: original record text, attribute key, attribute value, then ";".
new_line_fmt = '{} {} {};\n'


def _build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser (required -i/--infile, -o/--outfile)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True)
    parser.add_argument("-o", "--outfile", required=True)
    return parser


def _add_gene_names(lines: Iterable[str], fout: TextIO) -> None:
    """Write each matching record from *lines* to *fout* with `gene_name` appended.

    Lines starting with "#" are skipped and not written. Lines matching
    neither pattern are reported on stdout and not written. Gene names are
    learned from gene records as they are encountered, so a child record
    that precedes its gene record gets an empty gene_name value.
    """
    # Stores (gene_id -> Name); both sides include the captured quotes.
    mapper = {}
    # start=1 so the diagnostic below reports the real (1-based) file line.
    for lineno, line in enumerate(lines, start=1):
        if line.startswith("#"):
            continue
        entry = line.rstrip('\n')

        gmatch = gene_matcher.search(entry)
        if gmatch:
            print("gene match")
            gid, gname = gmatch.groups()
            mapper[gid] = gname
        else:
            # Only try the parent pattern when the gene pattern did not match.
            pmatch = parent_matcher.search(entry)
            if pmatch is None:
                print(f"found wrong thing on line: {lineno}")
                continue
            gid = pmatch.group(1)
            # Unknown gene_id falls back to an empty value ("gene_name ;").
            gname = mapper.get(gid, "")
        fout.write(new_line_fmt.format(entry, "gene_name", gname))


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse the command line and annotate INFILE into OUTFILE.

    Args:
        argv: Argument list to parse; ``None`` means use ``sys.argv[1:]``.
    """
    args = _build_parser().parse_args(argv)
    with open(args.infile, "r") as fin, open(args.outfile, "w") as fout:
        _add_gene_names(fin, fout)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment