chilampoon · December 8, 2022 04:51 · chilampoon · Feb 9, 2023
diff --git a/fix_t2t_gff.py b/fix_t2t_gff.py
 #!/usr/bin/env python

 # fix t2t gff3 error
 # the problem: IDs for some start/stop codons and exons are not unique
 # 11/20/2022

 import sys,gzip
 from collections import defaultdict

 def sep_info(info):
    """Split a GFF3 attribute string into a list of ``(key, value)`` tuples.

    The string is cut at ';' and each piece at its FIRST '=', so a value
    that itself contains '=' is kept whole.  Empty pieces (for example the
    one produced by a trailing ';') are skipped.

    Raises ValueError if a non-empty piece contains no '='.
    """
    pairs = []
    for item in info.split(';'):
        if not item.strip():
            # empty field, e.g. from a trailing ';' -- nothing to record
            continue
        key, eq, value = item.partition('=')
        if not eq:
            raise ValueError(f'attribute without "=": {item!r}')
        pairs.append((key, value))
    return pairs

 def comb_info(items):
    """Join ``(key, value)`` pairs into one ``key=value;key=value`` string.

    Only the first two elements of each pair are used; an empty input
    yields an empty string.
    """
    return ';'.join(f'{pair[0]}={pair[1]}' for pair in items)

 def update_id(r, info, count, replace=False):
    """Give the ``ID`` attribute of a record a ``:<count>`` suffix.

    r       -- list of column values; element 8 is overwritten in place
               with the re-serialised attributes.
    info    -- list of (key, value) pairs; the first pair whose key is
               'ID' is replaced in place.
    count   -- value placed after the final ':' of the new ID.
    replace -- if true, the last ':'-separated component of the current
               ID is dropped before the suffix is added; otherwise the
               suffix is appended to the whole current ID.

    Returns ``r`` (the same list object).  Raises IndexError when ``info``
    holds no 'ID' pair.
    """
    id_slots = [slot for slot, pair in enumerate(info) if pair[0] == 'ID']
    where = id_slots[0]
    entry = info[where]
    if replace:
        # drop the existing trailing component, keep everything before it
        stem = ':'.join(entry[1].split(':')[:-1])
    else:
        stem = entry[1]
    info[where] = (entry[0], f'{stem}:{count}')
    r[8] = comb_info(info)
    return r

 def fix_id(gff3, out):
    """Write a copy of a GFF3 file in which repeated feature IDs are numbered.

    Lines starting with '#' and lines with fewer than nine tab-separated
    columns are copied unchanged.  For the remaining records:

    * ``start_codon`` / ``stop_codon`` rows get ``:<n>`` appended to their
      ID, where ``n`` is the number of earlier rows of that type sharing
      the same ``transcript_id`` (so numbering starts at 0).
    * rows whose source column is ``Liftoff`` (except ``gene`` and
      ``transcript`` rows) are expected to carry IDs shaped ``a:b`` or
      ``a:b:<n>``; an ID without a suffix counts as number 0.  Whenever
      the number differs from how many times ``a:b`` has been seen so
      far, the ID is rewritten to ``a:b:<seen>``.  IDs with any other
      number of ':'-separated parts are reported on stdout and the row is
      written as it stands.

    gff3 -- path of the plain-text input file.
    out  -- path of the gzip-compressed output file.
    """
    start_codon_cnt = defaultdict(int)
    stop_codon_cnt = defaultdict(int)
    liftoff_id_cnt = defaultdict(int)
    # feature type -> per-transcript counter used for its ID suffix
    codon_counters = {
        'start_codon': start_codon_cnt,
        'stop_codon': stop_codon_cnt,
    }

    # the handle gets its own name so the ``out`` path is not shadowed
    with open(gff3, 'r') as f, gzip.open(out, 'wb') as out_fh:
        for row in f:
            if row.startswith('#'):
                out_fh.write(row.encode())
                continue

            r = row.rstrip().split('\t')
            if len(r) < 9:
                # blank or truncated line: no attribute column to fix
                out_fh.write(row.encode())
                continue

            counter = codon_counters.get(r[2])
            if counter is not None:
                info = sep_info(r[8])
                tx_id = [i[1] for i in info if i[0] == 'transcript_id'][0]
                new_r = update_id(r, info, counter[tx_id])
                row = '\t'.join(new_r) + '\n'
                counter[tx_id] += 1

            if r[1] == 'Liftoff' and r[2] not in ('gene', 'transcript'):
                # r[8] is re-parsed because the codon step may have changed it
                info = sep_info(r[8])
                old_id = [i[1] for i in info if i[0] == 'ID'][0]
                tmp = old_id.split(':')
                if len(tmp) == 2:
                    id_num = 0
                    this_id = old_id
                    repl = False
                elif len(tmp) == 3:
                    id_num = int(tmp[-1])
                    this_id = ':'.join(tmp[:-1])
                    repl = True
                else:
                    # unexpected ID shape: report it and leave the row alone
                    # rather than reusing values left over from another row
                    print(f'check {old_id}')
                    out_fh.write(row.encode())
                    continue
                if id_num != liftoff_id_cnt[this_id]:
                    new_r = update_id(r, info, liftoff_id_cnt[this_id], replace=repl)
                    row = '\t'.join(new_r) + '\n'
                liftoff_id_cnt[this_id] += 1

            out_fh.write(row.encode())


 # command line: <input gff3 path> <output path>
 gff3_file, out_file = sys.argv[1], sys.argv[2]
 fix_id(gff3_file, out_file)

 # run: python fix_t2t_gff.py chm13.draft_v2.0.gene_annotation.gff3 chm13.draft_v2.0.gene_annotation.fixed.gff3.gz
	#!/usr/bin/env python

	# fix t2t gff3 error
	# the problem: IDs for some start/stop codons and exons are not unique
	# 11/20/2022

	import sys,gzip
	from collections import defaultdict

	def sep_info(info):
	    """Split a GFF3 attribute string into a list of ``(key, value)`` tuples.

	    The string is cut at ';' and each piece at its FIRST '=', so a value
	    that itself contains '=' is kept whole.  Empty pieces (for example the
	    one produced by a trailing ';') are skipped.

	    Raises ValueError if a non-empty piece contains no '='.
	    """
	    pairs = []
	    for item in info.split(';'):
	        if not item.strip():
	            # empty field, e.g. from a trailing ';' -- nothing to record
	            continue
	        key, eq, value = item.partition('=')
	        if not eq:
	            raise ValueError(f'attribute without "=": {item!r}')
	        pairs.append((key, value))
	    return pairs

	def comb_info(items):
	    """Join ``(key, value)`` pairs into one ``key=value;key=value`` string.

	    Only the first two elements of each pair are used; an empty input
	    yields an empty string.
	    """
	    res = [f'{i[0]}={i[1]}' for i in items]
	    return ';'.join(res)

	def update_id(r, info, count, replace=False):
	    """Give the ``ID`` attribute of a record a ``:<count>`` suffix.

	    r       -- list of column values; element 8 is overwritten in place
	               with the re-serialised attributes.
	    info    -- list of (key, value) pairs; the first pair whose key is
	               'ID' is replaced in place.
	    count   -- value placed after the final ':' of the new ID.
	    replace -- if true, the last ':'-separated component of the current
	               ID is dropped before the suffix is added; otherwise the
	               suffix is appended to the whole current ID.

	    Returns ``r`` (the same list object).  Raises IndexError when ``info``
	    holds no 'ID' pair.
	    """
	    (idx, sid) = [(i, j) for i, j in enumerate(info) if j[0] == 'ID'][0]
	    if replace:
	        # drop the existing trailing component, keep everything before it
	        tmp = sid[1].split(':')
	        new_id = f'{":".join(tmp[:-1])}:{count}'
	    else:
	        new_id = f'{sid[1]}:{count}'
	    info[idx] = (sid[0], new_id)
	    r[8] = comb_info(info)
	    return r

	def fix_id(gff3, out):
	    """Write a copy of a GFF3 file in which repeated feature IDs are numbered.

	    Lines starting with '#' and lines with fewer than nine tab-separated
	    columns are copied unchanged.  For the remaining records:

	    * ``start_codon`` / ``stop_codon`` rows get ``:<n>`` appended to their
	      ID, where ``n`` is the number of earlier rows of that type sharing
	      the same ``transcript_id`` (so numbering starts at 0).
	    * rows whose source column is ``Liftoff`` (except ``gene`` and
	      ``transcript`` rows) are expected to carry IDs shaped ``a:b`` or
	      ``a:b:<n>``; an ID without a suffix counts as number 0.  Whenever
	      the number differs from how many times ``a:b`` has been seen so
	      far, the ID is rewritten to ``a:b:<seen>``.  IDs with any other
	      number of ':'-separated parts are reported on stdout and the row is
	      written as it stands.

	    gff3 -- path of the plain-text input file.
	    out  -- path of the gzip-compressed output file.
	    """
	    start_codon_cnt = defaultdict(int)
	    stop_codon_cnt = defaultdict(int)
	    liftoff_id_cnt = defaultdict(int)
	    # feature type -> per-transcript counter used for its ID suffix
	    codon_counters = {
	        'start_codon': start_codon_cnt,
	        'stop_codon': stop_codon_cnt,
	    }

	    # the handle gets its own name so the ``out`` path is not shadowed
	    with open(gff3, 'r') as f, gzip.open(out, 'wb') as out_fh:
	        for row in f:
	            if row.startswith('#'):
	                out_fh.write(row.encode())
	                continue

	            r = row.rstrip().split('\t')
	            if len(r) < 9:
	                # blank or truncated line: no attribute column to fix
	                out_fh.write(row.encode())
	                continue

	            counter = codon_counters.get(r[2])
	            if counter is not None:
	                info = sep_info(r[8])
	                tx_id = [i[1] for i in info if i[0] == 'transcript_id'][0]
	                new_r = update_id(r, info, counter[tx_id])
	                row = '\t'.join(new_r) + '\n'
	                counter[tx_id] += 1

	            if r[1] == 'Liftoff' and r[2] not in ('gene', 'transcript'):
	                # r[8] is re-parsed because the codon step may have changed it
	                info = sep_info(r[8])
	                old_id = [i[1] for i in info if i[0] == 'ID'][0]
	                tmp = old_id.split(':')
	                if len(tmp) == 2:
	                    id_num = 0
	                    this_id = old_id
	                    repl = False
	                elif len(tmp) == 3:
	                    id_num = int(tmp[-1])
	                    this_id = ':'.join(tmp[:-1])
	                    repl = True
	                else:
	                    # unexpected ID shape: report it and leave the row alone
	                    # rather than reusing values left over from another row
	                    print(f'check {old_id}')
	                    out_fh.write(row.encode())
	                    continue
	                if id_num != liftoff_id_cnt[this_id]:
	                    new_r = update_id(r, info, liftoff_id_cnt[this_id], replace=repl)
	                    row = '\t'.join(new_r) + '\n'
	                liftoff_id_cnt[this_id] += 1

	            out_fh.write(row.encode())


	# command line: <input gff3 path> <output path>
	gff3_file, out_file = sys.argv[1], sys.argv[2]
	fix_id(gff3_file, out_file)

	# run: python fix_t2t_gff.py chm13.draft_v2.0.gene_annotation.gff3 chm13.draft_v2.0.gene_annotation.fixed.gff3.gz