Last active
June 6, 2018 20:14
-
-
Save fomightez/baf668acd4c51586deed2a2c89fcac67 to your computer and use it in GitHub Desktop.
snippets for dealing with BLAST
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # see https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4 for | |
| # the main ones dealing with BLAST and dataframes, search `blast` to find | |
| # see two Jupyter notebooks in https://github.com/fomightez/blast-binder | |
| # SEE MOE FUNCTIONAL VERSION OF THIS FUNCTION AT https://github.com/fomightez/sequencework/tree/master/blast-utilities | |
| def BLAST_to_df(results_file): | |
| ''' | |
| BLAST results to Pandas dataframe | |
| based on https://medium.com/@auguste.dutcher/turn-blast-results-into-a-presence-absence-matrix-cc44429c814 | |
| with | |
| http://library.open.oregonstate.edu/computationalbiology/chapter/command-line-blast/ | |
| and | |
| https://www.ncbi.nlm.nih.gov/books/NBK279684/ (Table C1:) | |
| for guides to what the codes mean | |
| returns a dataframe | |
| ''' | |
| import pandas as pd | |
| with open(results_file, 'r') as infile: | |
| # Here's where the BLAST command comes in handy | |
| col_names = ['qseqid', 'sseqid', 'stitle', 'pident', 'qcovs', 'length', | |
| 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'qframe', | |
| 'sframe', 'frames', 'evalue', 'bitscore', 'qseq', 'sseq'] | |
| return pd.read_csv(infile, sep='\t', header=None, names=col_names) | |
| !blastn -query consensus.fa -db chrmt.fsa -outfmt "6 qseqid sseqid stitle pident qcovs length mismatch gapopen qstart qend sstart send qframe sframe frames evalue bitscore qseq sseq" -task blastn -gapopen 0 -gapextend 2 -reward 1 -penalty -2 -word_size 14 -out consensus.x.chrmt.comp.txt | |
| results_file = 'consensus.x.chrmt.comp.txt' | |
| blast_df = BLAST_to_df(results_file) | |
| def order_low_to_high(row): | |
| ''' | |
| Take start and end contens and puts them in order so lowest first and | |
| highest second. | |
| To be used to help arrange BLAST results so match convention ENSEMBL uses | |
| for reporting intervals of genes. | |
| Because came from BLAST results, `start` and `end` are `sstart` and `send`. | |
| ''' | |
| if row.sstart == row.send: | |
| # shouldn't happen but putting here to cover it | |
| return row | |
| else: | |
| items = [row.sstart,row.send] | |
| row.sstart , row.send = min(items),max(items) | |
| return row | |
| def midpoint(items): | |
| ''' | |
| takes a iterable of | |
| ''' | |
| return int((int(items[0])+int(items[1]))/2) | |
| list_to_add = [] | |
| current_qseid = "" | |
| ids_in_previous_categories = 0 | |
| for row in blast_df.itertuples(): | |
| if row.qseqid == current_qseid: | |
| id_num_for_element = (row.Index+1) - ids_in_previous_categories | |
| else: | |
| current_qseid = row.qseqid | |
| ids_in_previous_categories = row.Index | |
| id_num_for_element = 1 | |
| #list_to_add.append("{}-{}".format(row.qseqid,row.Index+1)) # gave unique number among all GC-clustes | |
| list_to_add.append("{}-{}".format(row.qseqid,id_num_for_element)) # gives unique number per type | |
| #print (row.Index) | |
| list_to_add | |
| df["sys_gene_id"] = list_to_add | |
| df["gene_symbol"] = None | |
| df = blast_df[["sys_gene_id","gene_symbol","sstart","send","sframe"]] | |
| # Make ORDER OF START AND END matchENSEMBL | |
| df = df.apply(order_low_to_high, axis=1) | |
| # rename columns to match previous conventions | |
| df = df.rename(columns = { | |
| 'sstart':'start', | |
| 'send':'end', | |
| 'sframe':'strand', | |
| }) | |
| df['midpoint'] = df[['start','end']].apply(midpoint, axis=1)# Easiest | |
| # to be consistent with previous convention of column order. | |
| columns = ['sys_gene_id','gene_symbol', | |
| 'start','end','midpoint','strand'] | |
| df = df[columns] | |
| df.to_pickle("df_for_merging.pkl") | |
| df.head() | |
| # see also [A compilation of conversion tools for BED, SAM/BAM, psl, pslx, blast tabular and blast xml](http://bioinformatics.cvr.ac.uk/blog/a-compilation-of-conversion-tools-for-bed-sambam-psl-pslx-blast-tabular-and-blast-xml/) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment