Created
August 21, 2023 16:32
-
-
Save SemiQuant/7686af449a4f6ba29ca9bd478a2ac3ff to your computer and use it in GitHub Desktop.
Convert NCBI gff for bacteria to a usable format for bcftools csq
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gffutils | |
input_gff_file = "Mycobacterium_tuberculosis_H37Rv_gff_v4.gff" | |
output_gff_file = input_gff_file.replace('.gff', '_fixed.gff') | |
# Create a new GFF database | |
db = gffutils.create_db(input_gff_file, dbfn=':memory:', force=True, merge_strategy='replace') | |
# Create a new GFF writer | |
output_handle = open(output_gff_file, 'w') | |
output_handle.write("##gff-version 3\n") | |
# Process each feature | |
for feature in db.all_features(): | |
if feature.featuretype == 'CDS': | |
attributes = dict(feature.attributes) | |
gene_identifier = str(attributes.get("locus_tag", attributes.get("gene", attributes.get("Name")))[0]) | |
gene_name = attributes["Name"][0] if "Name" in attributes else "" | |
gene_attributes = { | |
'ID': ["gene:" + gene_identifier], | |
'gene_biotype': ['protein_coding'], | |
'Name': [gene_name], | |
'gene': [gene_identifier] | |
} | |
transcript_attributes = { | |
'ID': ["transcript:" + gene_identifier], | |
'Parent': [gene_identifier], | |
'gene_biotype': ['protein_coding'], | |
'Name': [gene_name], | |
'gene': [gene_identifier] | |
} | |
gene_feature = gffutils.Feature( | |
seqid=feature.seqid, | |
source=feature.source, | |
featuretype='gene', | |
start=feature.start, | |
end=feature.end, | |
strand=feature.strand, | |
attributes=gene_attributes | |
) | |
transcript_feature = gffutils.Feature( | |
seqid=feature.seqid, | |
source=feature.source, | |
featuretype='mRNA', | |
start=feature.start, | |
end=feature.end, | |
strand=feature.strand, | |
attributes=transcript_attributes | |
) | |
output_handle.write(str(gene_feature) + '\n') | |
output_handle.write(str(transcript_feature) + '\n') | |
output_handle.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment