Skip to content

Instantly share code, notes, and snippets.

@SemiQuant
Created August 21, 2023 16:32
Show Gist options
  • Save SemiQuant/7686af449a4f6ba29ca9bd478a2ac3ff to your computer and use it in GitHub Desktop.
Save SemiQuant/7686af449a4f6ba29ca9bd478a2ac3ff to your computer and use it in GitHub Desktop.
Convert NCBI gff for bacteria to a usable format for bcftools csq
import gffutils
input_gff_file = "Mycobacterium_tuberculosis_H37Rv_gff_v4.gff"
output_gff_file = input_gff_file.replace('.gff', '_fixed.gff')
# Create a new GFF database
db = gffutils.create_db(input_gff_file, dbfn=':memory:', force=True, merge_strategy='replace')
# Create a new GFF writer
output_handle = open(output_gff_file, 'w')
output_handle.write("##gff-version 3\n")
# Process each feature
for feature in db.all_features():
if feature.featuretype == 'CDS':
attributes = dict(feature.attributes)
gene_identifier = str(attributes.get("locus_tag", attributes.get("gene", attributes.get("Name")))[0])
gene_name = attributes["Name"][0] if "Name" in attributes else ""
gene_attributes = {
'ID': ["gene:" + gene_identifier],
'gene_biotype': ['protein_coding'],
'Name': [gene_name],
'gene': [gene_identifier]
}
transcript_attributes = {
'ID': ["transcript:" + gene_identifier],
'Parent': [gene_identifier],
'gene_biotype': ['protein_coding'],
'Name': [gene_name],
'gene': [gene_identifier]
}
gene_feature = gffutils.Feature(
seqid=feature.seqid,
source=feature.source,
featuretype='gene',
start=feature.start,
end=feature.end,
strand=feature.strand,
attributes=gene_attributes
)
transcript_feature = gffutils.Feature(
seqid=feature.seqid,
source=feature.source,
featuretype='mRNA',
start=feature.start,
end=feature.end,
strand=feature.strand,
attributes=transcript_attributes
)
output_handle.write(str(gene_feature) + '\n')
output_handle.write(str(transcript_feature) + '\n')
output_handle.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment