Created
March 4, 2022 11:19
-
-
Save avrilcoghlan/dfcf809a87b6b154f22bfd41e53f8557 to your computer and use it in GitHub Desktop.
Python script that keeps, for each query gene, only the top ChEMBL hit plus any hits with E-values within 1e+5 of it, and only keeps hits with an E-value <= 1e-10.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from collections import defaultdict | |
import FiftyHG_Chembl | |
#====================================================================# | |
def main():
    """Reformat every BLAST output file found in the current directory.

    A file is treated as BLAST output when its name ends in '.txt' and does
    not contain 'chembl' (eg. schistosoma_mansoni.txt). For each such file:

    * the species name is the part of the file name before '.txt';
    * the output path is the input path with '2' appended
      (eg. schistosoma_mansoni.txt2);
    * unless the output file already exists,
      FiftyHG_Chembl.reformat_blast_output_besthitonly() is called to write
      it, so existing outputs are never regenerated.

    NOTE(review): the last argument (1e-10) is presumably the maximum E-value
    accepted for a hit -- confirm against FiftyHG_Chembl.
    """
    mydir = os.getcwd()  # the current directory

    # find the blast output files:
    for filename in os.listdir(mydir):
        # skip anything that is not a per-species blast output file:
        if not filename.endswith('.txt') or 'chembl' in filename:
            continue

        # find the species name:
        species = filename.split('.txt')[0]  # eg. schistosoma_mansoni

        # parse and format this blast output file, to just take the top blast
        # match of a helminth gene, considering all the splice-forms of a
        # helminth gene:
        input_file = os.path.join(mydir, filename)
        output_file = '%s2' % input_file
        # this message is printed even when the output already exists and the
        # reformatting step below is skipped:
        print('Making file', output_file)
        if not os.path.exists(output_file):
            FiftyHG_Chembl.reformat_blast_output_besthitonly(input_file, species, output_file, 1e-10)

    print("FINISHED\n")
#====================================================================# | |
# Run the reformatting only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
#====================================================================# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment