fomightez · June 6, 2018 20:14
diff --git a/useful_BLAST_handling.py b/useful_BLAST_handling.py
 # see https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4 for 
 # the main ones dealing with BLAST and dataframes, search `blast` to find

 # see two Jupyter notebooks in https://github.com/fomightez/blast-binder


 # SEE MOE FUNCTIONAL VERSION OF THIS FUNCTION AT https://github.com/fomightez/sequencework/tree/master/blast-utilities
 def BLAST_to_df(results_file):
    '''
    BLAST results to Pandas dataframe
    based on https://medium.com/@auguste.dutcher/turn-blast-results-into-a-presence-absence-matrix-cc44429c814
    with 
    http://library.open.oregonstate.edu/computationalbiology/chapter/command-line-blast/
    and 
    https://www.ncbi.nlm.nih.gov/books/NBK279684/ (Table C1:) 
    for guides to what the codes mean
    
    returns a dataframe
    '''
    import pandas as pd
    with open(results_file, 'r') as infile:
        # Here's where the BLAST command comes in handy
        col_names = ['qseqid', 'sseqid', 'stitle', 'pident', 'qcovs', 'length', 
        'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'qframe', 
        'sframe', 'frames', 'evalue', 'bitscore', 'qseq', 'sseq']
        return pd.read_csv(infile, sep='\t', header=None, names=col_names) 

    

 !blastn -query consensus.fa -db chrmt.fsa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq" -task blastn -gapopen 0 -gapextend 2 -reward 1 -penalty -2 -word_size 14 -out consensus.x.chrmt.comp.txt
 results_file = 'consensus.x.chrmt.comp.txt'
 blast_df = BLAST_to_df(results_file)





 def order_low_to_high(row):
    '''
    Take start and end contens and puts them in order so lowest first and 
    highest second.

    To be used to help arrange BLAST results so match convention ENSEMBL uses 
    for reporting intervals of genes.
    Because came from BLAST results, `start` and `end` are `sstart` and `send`.
    '''
    if row.sstart == row.send:
        # shouldn't happen but putting here to cover it
        return row
    else:
        items = [row.sstart,row.send]
        row.sstart , row.send = min(items),max(items)
        return row

 def midpoint(items):
    '''
    takes a iterable of 
    '''
    return int((int(items[0])+int(items[1]))/2)


 list_to_add = []
 current_qseid = ""
 ids_in_previous_categories = 0
 for row in blast_df.itertuples():
    if row.qseqid == current_qseid:
        id_num_for_element = (row.Index+1) - ids_in_previous_categories
    else:
        current_qseid = row.qseqid
        ids_in_previous_categories = row.Index
        id_num_for_element = 1
    #list_to_add.append("{}-{}".format(row.qseqid,row.Index+1)) # gave unique number among all GC-clustes
    list_to_add.append("{}-{}".format(row.qseqid,id_num_for_element)) # gives unique number per type
    #print (row.Index)
 list_to_add
 df["sys_gene_id"] = list_to_add
 df["gene_symbol"] = None
 df = blast_df[["sys_gene_id","gene_symbol","sstart","send","sframe"]]
 # Make ORDER OF START AND END matchENSEMBL
 df = df.apply(order_low_to_high, axis=1)
 # rename columns to match previous conventions
 df = df.rename(columns = {
    'sstart':'start',
    'send':'end',
    'sframe':'strand',
 })
 df['midpoint'] = df[['start','end']].apply(midpoint, axis=1)# Easiest 
 # to be consistent with previous convention of column order.
 columns = ['sys_gene_id','gene_symbol',
 'start','end','midpoint','strand']
 df = df[columns]
 df.to_pickle("df_for_merging.pkl")
 df.head()



 # see also [A compilation of conversion tools for BED, SAM/BAM, psl, pslx, blast tabular and blast xml](http://bioinformatics.cvr.ac.uk/blog/a-compilation-of-conversion-tools-for-bed-sambam-psl-pslx-blast-tabular-and-blast-xml/)
	# see https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4 for
	# the main ones dealing with BLAST and dataframes, search `blast` to find

	# see two Jupyter notebooks in https://github.com/fomightez/blast-binder


	# SEE MOE FUNCTIONAL VERSION OF THIS FUNCTION AT https://github.com/fomightez/sequencework/tree/master/blast-utilities
	def BLAST_to_df(results_file):
	'''
	BLAST results to Pandas dataframe
	based on https://medium.com/@auguste.dutcher/turn-blast-results-into-a-presence-absence-matrix-cc44429c814
	with
	http://library.open.oregonstate.edu/computationalbiology/chapter/command-line-blast/
	and
	https://www.ncbi.nlm.nih.gov/books/NBK279684/ (Table C1:)
	for guides to what the codes mean

	returns a dataframe
	'''
	import pandas as pd
	with open(results_file, 'r') as infile:
	# Here's where the BLAST command comes in handy
	col_names = ['qseqid', 'sseqid', 'stitle', 'pident', 'qcovs', 'length',
	'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'qframe',
	'sframe', 'frames', 'evalue', 'bitscore', 'qseq', 'sseq']
	return pd.read_csv(infile, sep='\t', header=None, names=col_names)



	!blastn -query consensus.fa -db chrmt.fsa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq" -task blastn -gapopen 0 -gapextend 2 -reward 1 -penalty -2 -word_size 14 -out consensus.x.chrmt.comp.txt
	results_file = 'consensus.x.chrmt.comp.txt'
	blast_df = BLAST_to_df(results_file)





	def order_low_to_high(row):
	'''
	Take start and end contens and puts them in order so lowest first and
	highest second.

	To be used to help arrange BLAST results so match convention ENSEMBL uses
	for reporting intervals of genes.
	Because came from BLAST results, `start` and `end` are `sstart` and `send`.
	'''
	if row.sstart == row.send:
	# shouldn't happen but putting here to cover it
	return row
	else:
	items = [row.sstart,row.send]
	row.sstart , row.send = min(items),max(items)
	return row

	def midpoint(items):
	'''
	takes a iterable of
	'''
	return int((int(items[0])+int(items[1]))/2)


	list_to_add = []
	current_qseid = ""
	ids_in_previous_categories = 0
	for row in blast_df.itertuples():
	if row.qseqid == current_qseid:
	id_num_for_element = (row.Index+1) - ids_in_previous_categories
	else:
	current_qseid = row.qseqid
	ids_in_previous_categories = row.Index
	id_num_for_element = 1
	#list_to_add.append("{}-{}".format(row.qseqid,row.Index+1)) # gave unique number among all GC-clustes
	list_to_add.append("{}-{}".format(row.qseqid,id_num_for_element)) # gives unique number per type
	#print (row.Index)
	list_to_add
	df["sys_gene_id"] = list_to_add
	df["gene_symbol"] = None
	df = blast_df[["sys_gene_id","gene_symbol","sstart","send","sframe"]]
	# Make ORDER OF START AND END matchENSEMBL
	df = df.apply(order_low_to_high, axis=1)
	# rename columns to match previous conventions
	df = df.rename(columns = {
	'sstart':'start',
	'send':'end',
	'sframe':'strand',
	})
	df['midpoint'] = df[['start','end']].apply(midpoint, axis=1)# Easiest
	# to be consistent with previous convention of column order.
	columns = ['sys_gene_id','gene_symbol',
	'start','end','midpoint','strand']
	df = df[columns]
	df.to_pickle("df_for_merging.pkl")
	df.head()



	# see also [A compilation of conversion tools for BED, SAM/BAM, psl, pslx, blast tabular and blast xml](http://bioinformatics.cvr.ac.uk/blog/a-compilation-of-conversion-tools-for-bed-sambam-psl-pslx-blast-tabular-and-blast-xml/)