Skip to content

Instantly share code, notes, and snippets.

@mapmeld
Created June 4, 2024 23:33
Show Gist options
  • Save mapmeld/85e8647601b019c5d3b57d3f62454009 to your computer and use it in GitHub Desktop.
Save mapmeld/85e8647601b019c5d3b57d3f62454009 to your computer and use it in GitHub Desktop.
gff-peanut-butter.py
# retrieve contig
searchables = ['JAWIZW010000099.1', 'JAWIZW010000077.1']
contigs = []
with open('./hxntes/ncbi_dataset/data/GCA_034703905.1/GCA_034703905.1_C.berlandieri_1.0_genomic.fna', 'r') as contigfile:
inline = False
sequence = ""
lastSection = ""
for line in contigfile:
if '>' in line:
mySection = line[1:line.index(' ')]
if inline:
if '>' in line:
contigs.append({
"sequence": sequence,
"id": lastSection,
})
lastSection = mySection
sequence = ""
inline = False
else:
sequence += line.strip()
if mySection in searchables:
inline = True
lastSection = mySection
# print(searchables)
# get name and header from annotation
def parseHeader(line):
#>NC_085145.1:c1275-214 psbA [organism=Chenopodium berlandieri] [GeneID=87690347] [chromosome=Plastid: chloroplast]
name = line[line.index(' '):line.index('[')].strip()
id = line[line.index('GeneID='):]
id = id[:id.index(']')]
return name, id
# retrieve genes
annotations = []
with open('./genes/ncbi_dataset/data/gene.fna', 'r') as genefile:
inline = False
seek = ""
name = ""
id = ""
for line in genefile:
if inline:
if '>' in line:
annotations.append({
"seek": seek,
"name": name,
"id": id,
})
name, id = parseHeader(line)
seek = ""
else:
seek += line.strip()
if '>' in line:
inline = True
name, id = parseHeader(line)
seek = ""
seen = []
with open('./test.gff3', 'w') as gffout:
gffout.write('##gff-version 3\n')
for contig in contigs:
sequence = contig["sequence"]
for annotation in annotations:
if annotation["seek"][:20] not in sequence:
continue
if annotation["name"] in seen:
if sequence.index(annotation["seek"][:20]) != sequence.rindex(annotation["seek"][:20]):
start = sequence.rindex(annotation["seek"][:20]) + 1
else:
continue
else:
start = sequence.index(annotation["seek"][:20]) + 1
end = start + len(annotation["seek"]) - 1
seen.append(annotation["name"])
dir = '+'
if annotation["name"] in ['trnI-CAU', 'psbA']:
dir = '-'
gffout.write('\t'.join([
contig["id"],
'ncbi.nlm.nih.gov',
'Gene',
str(start),
str(end),
'100.0',
dir,
'0',
f'name={annotation["name"]};{annotation["id"]}'
]) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment