Skip to content

Instantly share code, notes, and snippets.

@fomightez
Last active June 6, 2018 20:14
Show Gist options
  • Save fomightez/baf668acd4c51586deed2a2c89fcac67 to your computer and use it in GitHub Desktop.
Save fomightez/baf668acd4c51586deed2a2c89fcac67 to your computer and use it in GitHub Desktop.
snippets for dealing with BLAST
# see https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4 for
# the main ones dealing with BLAST and dataframes, search `blast` to find
# see two Jupyter notebooks in https://github.com/fomightez/blast-binder
# SEE MOE FUNCTIONAL VERSION OF THIS FUNCTION AT https://github.com/fomightez/sequencework/tree/master/blast-utilities
def BLAST_to_df(results_file):
'''
BLAST results to Pandas dataframe
based on https://medium.com/@auguste.dutcher/turn-blast-results-into-a-presence-absence-matrix-cc44429c814
with
http://library.open.oregonstate.edu/computationalbiology/chapter/command-line-blast/
and
https://www.ncbi.nlm.nih.gov/books/NBK279684/ (Table C1:)
for guides to what the codes mean
returns a dataframe
'''
import pandas as pd
with open(results_file, 'r') as infile:
# Here's where the BLAST command comes in handy
col_names = ['qseqid', 'sseqid', 'stitle', 'pident', 'qcovs', 'length',
'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'qframe',
'sframe', 'frames', 'evalue', 'bitscore', 'qseq', 'sseq']
return pd.read_csv(infile, sep='\t', header=None, names=col_names)
!blastn -query consensus.fa -db chrmt.fsa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq" -task blastn -gapopen 0 -gapextend 2 -reward 1 -penalty -2 -word_size 14 -out consensus.x.chrmt.comp.txt
results_file = 'consensus.x.chrmt.comp.txt'
blast_df = BLAST_to_df(results_file)
def order_low_to_high(row):
'''
Take start and end contens and puts them in order so lowest first and
highest second.
To be used to help arrange BLAST results so match convention ENSEMBL uses
for reporting intervals of genes.
Because came from BLAST results, `start` and `end` are `sstart` and `send`.
'''
if row.sstart == row.send:
# shouldn't happen but putting here to cover it
return row
else:
items = [row.sstart,row.send]
row.sstart , row.send = min(items),max(items)
return row
def midpoint(items):
'''
takes a iterable of
'''
return int((int(items[0])+int(items[1]))/2)
list_to_add = []
current_qseid = ""
ids_in_previous_categories = 0
for row in blast_df.itertuples():
if row.qseqid == current_qseid:
id_num_for_element = (row.Index+1) - ids_in_previous_categories
else:
current_qseid = row.qseqid
ids_in_previous_categories = row.Index
id_num_for_element = 1
#list_to_add.append("{}-{}".format(row.qseqid,row.Index+1)) # gave unique number among all GC-clustes
list_to_add.append("{}-{}".format(row.qseqid,id_num_for_element)) # gives unique number per type
#print (row.Index)
list_to_add
df["sys_gene_id"] = list_to_add
df["gene_symbol"] = None
df = blast_df[["sys_gene_id","gene_symbol","sstart","send","sframe"]]
# Make ORDER OF START AND END matchENSEMBL
df = df.apply(order_low_to_high, axis=1)
# rename columns to match previous conventions
df = df.rename(columns = {
'sstart':'start',
'send':'end',
'sframe':'strand',
})
df['midpoint'] = df[['start','end']].apply(midpoint, axis=1)# Easiest
# to be consistent with previous convention of column order.
columns = ['sys_gene_id','gene_symbol',
'start','end','midpoint','strand']
df = df[columns]
df.to_pickle("df_for_merging.pkl")
df.head()
# see also [A compilation of conversion tools for BED, SAM/BAM, psl, pslx, blast tabular and blast xml](http://bioinformatics.cvr.ac.uk/blog/a-compilation-of-conversion-tools-for-bed-sambam-psl-pslx-blast-tabular-and-blast-xml/)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment