mapmeld · June 4, 2024 23:33
diff --git a/gff-peanut-butter.py b/gff-peanut-butter.py

 # retrieve contig
 searchables = ['JAWIZW010000099.1', 'JAWIZW010000077.1']
 contigs = []
 with open('./hxntes/ncbi_dataset/data/GCA_034703905.1/GCA_034703905.1_C.berlandieri_1.0_genomic.fna', 'r') as contigfile:
    inline = False
    sequence = ""
    lastSection = ""
    for line in contigfile:
        if '>' in line:
            mySection = line[1:line.index(' ')]
        if inline:
            if '>' in line:
                contigs.append({
                    "sequence": sequence,
                    "id": lastSection,
                })
                lastSection = mySection
                sequence = ""
                inline = False
            else:
                sequence += line.strip()
        if mySection in searchables:
            inline = True
            lastSection = mySection
 # print(searchables)

 # get name and header from annotation
 def parseHeader(line):
    #>NC_085145.1:c1275-214 psbA [organism=Chenopodium berlandieri] [GeneID=87690347] [chromosome=Plastid: chloroplast]
    name = line[line.index(' '):line.index('[')].strip()
    id = line[line.index('GeneID='):]
    id = id[:id.index(']')]
    return name, id

 # retrieve genes
 annotations = []
 with open('./genes/ncbi_dataset/data/gene.fna', 'r') as genefile:
    inline = False
    seek = ""
    name = ""
    id = ""
    for line in genefile:
        if inline:
            if '>' in line:
                annotations.append({
                    "seek": seek,
                    "name": name,
                    "id": id,
                })
                name, id = parseHeader(line)
                seek = ""
            else:
                seek += line.strip()
        if '>' in line:
            inline = True
            name, id = parseHeader(line)
            seek = ""

 seen = []
 with open('./test.gff3', 'w') as gffout:
    gffout.write('##gff-version 3\n')
    for contig in contigs:
        sequence = contig["sequence"]
        for annotation in annotations:
            if annotation["seek"][:20] not in sequence:
                continue
            if annotation["name"] in seen:
                if sequence.index(annotation["seek"][:20]) != sequence.rindex(annotation["seek"][:20]):
                    start = sequence.rindex(annotation["seek"][:20]) + 1
                else:
                    continue
            else:
                start = sequence.index(annotation["seek"][:20]) + 1
            end = start + len(annotation["seek"]) - 1
            seen.append(annotation["name"])

            dir = '+'
            if annotation["name"] in ['trnI-CAU', 'psbA']:
                dir = '-'

            gffout.write('\t'.join([
                contig["id"],
                'ncbi.nlm.nih.gov',
                'Gene',
                str(start),
                str(end),
                '100.0',
                dir,
                '0',
                f'name={annotation["name"]};{annotation["id"]}'
            ]) + '\n')

	# retrieve contig
	searchables = ['JAWIZW010000099.1', 'JAWIZW010000077.1']
	contigs = []
	with open('./hxntes/ncbi_dataset/data/GCA_034703905.1/GCA_034703905.1_C.berlandieri_1.0_genomic.fna', 'r') as contigfile:
	inline = False
	sequence = ""
	lastSection = ""
	for line in contigfile:
	if '>' in line:
	mySection = line[1:line.index(' ')]
	if inline:
	if '>' in line:
	contigs.append({
	"sequence": sequence,
	"id": lastSection,
	})
	lastSection = mySection
	sequence = ""
	inline = False
	else:
	sequence += line.strip()
	if mySection in searchables:
	inline = True
	lastSection = mySection
	# print(searchables)

	# get name and header from annotation
	def parseHeader(line):
	#>NC_085145.1:c1275-214 psbA [organism=Chenopodium berlandieri] [GeneID=87690347] [chromosome=Plastid: chloroplast]
	name = line[line.index(' '):line.index('[')].strip()
	id = line[line.index('GeneID='):]
	id = id[:id.index(']')]
	return name, id

	# retrieve genes
	annotations = []
	with open('./genes/ncbi_dataset/data/gene.fna', 'r') as genefile:
	inline = False
	seek = ""
	name = ""
	id = ""
	for line in genefile:
	if inline:
	if '>' in line:
	annotations.append({
	"seek": seek,
	"name": name,
	"id": id,
	})
	name, id = parseHeader(line)
	seek = ""
	else:
	seek += line.strip()
	if '>' in line:
	inline = True
	name, id = parseHeader(line)
	seek = ""

	seen = []
	with open('./test.gff3', 'w') as gffout:
	gffout.write('##gff-version 3\n')
	for contig in contigs:
	sequence = contig["sequence"]
	for annotation in annotations:
	if annotation["seek"][:20] not in sequence:
	continue
	if annotation["name"] in seen:
	if sequence.index(annotation["seek"][:20]) != sequence.rindex(annotation["seek"][:20]):
	start = sequence.rindex(annotation["seek"][:20]) + 1
	else:
	continue
	else:
	start = sequence.index(annotation["seek"][:20]) + 1
	end = start + len(annotation["seek"]) - 1
	seen.append(annotation["name"])

	dir = '+'
	if annotation["name"] in ['trnI-CAU', 'psbA']:
	dir = '-'

	gffout.write('\t'.join([
	contig["id"],
	'ncbi.nlm.nih.gov',
	'Gene',
	str(start),
	str(end),
	'100.0',
	dir,
	'0',
	f'name={annotation["name"]};{annotation["id"]}'
	]) + '\n')