Created
June 27, 2019 23:24
-
-
Save hiraksarkar/25f9cc89be0471f4e6666ee9eeebd940 to your computer and use it in GitHub Desktop.
Python file for coverage plots
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import pandas as pd | |
import tqdm | |
#from pyfasta import Fasta | |
#%matplotlib inline | |
import matplotlib | |
matplotlib.use('agg') | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from collections import defaultdict | |
import seaborn as sns | |
import numpy as np | |
import pysam | |
import os | |
import sys | |
sys.path.insert(0, '/home/hirak/Projects/Linguistic_Analysis_SRAdb/notebook/contamenation_metadata_analysis/') | |
from contamination import brokenaxes | |
# Taxonomy table; from its usage below, 'names' holds organism display
# names, 'txid' taxonomic ids, and 'organism' reference-sequence codes.
taxadf = pd.read_csv(
    '/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/taxadf.tsv',
    sep = '\t'
)
# Headerless two-column table: column 0 = reference name, column 1 = its
# sequence length (presumably in bases -- TODO confirm against the file).
length_df = pd.read_csv(
    '/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/combined/seq_length.tsv',
    header = None,
    sep = '\t'
)
# reference name -> sequence length; used to size per-genome coverage arrays.
ref_length = dict(zip(length_df[0],length_df[1]))
import re
def nonzero_intervals(vec):
    '''
    Locate the islands of non-zero entries in ``vec``.

    Returns [] for empty input, otherwise a tuple of (start, end) index
    pairs with ``end`` exclusive, one pair per contiguous non-zero run.
    '''
    if len(vec) == 0:
        return []
    if not isinstance(vec, np.ndarray):
        vec = np.array(vec)
    # Indices where the zero/non-zero state flips between neighbours;
    # +1 converts "position of the diff" into "position of the new state".
    flips, = np.nonzero(np.diff((vec == 0) * 1))
    boundary_parts = [flips + 1]
    # A vector that starts non-zero opens an interval at index 0 ...
    if vec[0] != 0:
        boundary_parts.insert(0, [0])
    # ... and one that ends non-zero closes an interval at len(vec).
    if vec[-1] != 0:
        boundary_parts.append([len(vec)])
    boundaries = np.concatenate(boundary_parts)
    # Alternate entries are interval starts and ends.
    return tuple(zip(boundaries[::2], boundaries[1::2]))
def slugify(value):
    """
    Normalize a string into a filesystem-safe slug: lowercase, strip
    characters that are not word characters/whitespace/hyphens, and
    collapse runs of whitespace or hyphens into a single hyphen.

    Args:
        value: the string to normalize (e.g. an organism name).

    Returns:
        The slugified string.
    """
    # BUG FIX: the original called unicode(), which only exists in
    # Python 2 and raises NameError under Python 3 (the rest of this
    # script already uses Python-3 print()).  str() is the equivalent
    # and a no-op on re.sub's str result.  Raw strings keep \w and \s
    # from triggering invalid-escape warnings.
    value = str(re.sub(r'[^\w\s-]', '', value).strip().lower())
    value = str(re.sub(r'[-\s]+', '-', value))
    return value
def merge_plots(org_name):
    """
    Plot log-scale read coverage along the genome of ``org_name`` for the
    single SRR experiment with the largest raw abundance for that taxon.

    Side effects (all under .../feb_11/coverage_plots_June_26/):
      * <slug>_<srr>.dat -- pickled coverage array (np.ndarray.dump)
      * <slug>.list      -- read names supporting each covered interval,
                            groups separated by a '--' line
      * <slug>.cov.png   -- the coverage scatter plot

    Relies on the module-level ``taxadf``, ``ref_length`` and
    ``raw_abundance_matrix`` objects and the ``slugify`` /
    ``nonzero_intervals`` helpers.
    """
    # '/' in an organism name would be read as a path separator in the
    # output file names, so blank it out before slugifying.
    if '/' in org_name:
        org_name_mod = org_name.replace('/', ' ')
    else:
        org_name_mod = org_name
    org_name_mod = slugify(org_name_mod)
    fig, ax = plt.subplots(figsize=(10, 7))
    txid = taxadf.loc[taxadf.names == org_name]['txid'].values[0]
    code_names = taxadf.loc[taxadf.txid == txid]['organism'].values
    lengths = [ref_length[code_name] for code_name in code_names]
    ax.set_xlim(right=max(lengths))
    # Only the most abundant experiment for this taxon is plotted.
    top_srr = raw_abundance_matrix.loc[txid].nlargest(1).index.tolist()
    srr_name_file = '/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/coverage_plots_June_26/{}.list'.format(org_name_mod)
    # BUG FIX: the original re-opened (and so truncated) this file with 'w'
    # on every loop iteration but closed it only once, leaking handles and
    # keeping only the last experiment's read names.  Open it exactly once.
    with open(srr_name_file, 'w') as fp:
        for srr in top_srr:
            # One coverage counter per genome position, initialised to 0.
            coverage_array = np.zeros(max(lengths))
            covfile = '/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/pufferfish_phiX/'+srr+'/HumanBacViral_sr0.85.txt'
            df_cov = pd.read_csv(
                covfile
            )
            df_cov = df_cov.loc[df_cov.refId == txid]
            # Count each read at most once per reference.
            df_cov = df_cov.drop_duplicates(['readName', 'refId'])
            if len(df_cov) > 0:
                # Each alignment adds 1 over the window
                # [refPos, refPos + score).
                for row in df_cov[['refPos', 'score']].iterrows():
                    coverage_array[row[1][0]:row[1][0] + row[1][1]] += 1
                coverage_array.dump(
                    '/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/coverage_plots_June_26/{}_{}.dat'.format(org_name_mod, srr)
                )
                # BUG FIX: the original assigned the *string* 'nan' into a
                # float array, which modern NumPy rejects; np.nan has the
                # same effect (positions with coverage < 100 are blanked
                # out of the log plot).
                coverage_array[coverage_array < 100] = np.nan
                # NOTE(review): NaN compares unequal to 0, so after the
                # masking above nonzero_intervals sees no zeros and likely
                # returns a single whole-genome interval -- confirm this
                # is intended, or compute the intervals before masking.
                coverage_intervals = nonzero_intervals(coverage_array)
                # Record the names of the reads that fall within (a 100 bp
                # margin of) each covered interval.
                for a, b in coverage_intervals:
                    names = df_cov.loc[(df_cov.refPos > a - 100) & (df_cov.refPos < b + 100)].drop_duplicates(['readName', 'refPos']).readName.values
                    for name in names:
                        fp.write('{}\n'.format(name))
                    fp.write('--\n')
                ax.plot(coverage_array, 'o', linewidth=1.2, markersize=10, label='{0}'.format(srr), alpha=0.5)
        plt.legend(loc=2, borderaxespad=0., numpoints=1, markerscale=1)
        plt.yscale('log')
        plt.savefig('/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/coverage_plots_June_26/{}.cov.png'.format(org_name_mod))
        plt.close()
# Raw abundance matrix; merge_plots indexes its rows by txid, so rows are
# presumably taxa and columns SRR experiments -- TODO confirm schema.
raw_abundance_matrix = pd.read_csv(
    '/mnt/scratch7/hirak/SRA_contamination_analysis/GSE_metadata_dataframes/abundance_raw_973.tsv',
    index_col = 0,
    sep = '\t'
)
# Second abundance matrix, used only to rank organisms below; its index
# values are passed to merge_plots as organism names.
abundance_matrix = pd.read_csv(
    '/mnt/scratch7/hirak/SRA_contamination_analysis/GSE_metadata_dataframes/abundance_973.tsv',
    index_col = 0,
    sep = '\t'
)
# NOTE(review): despite the name, this selects the 500 (not 50) entries
# with the largest total abundance summed across all experiments.
top_50_list = abundance_matrix.T.sum().nlargest(500).index.tolist()
# Plot coverage for entries ranked 101..500; the first 100 are skipped,
# presumably handled by an earlier run -- confirm before re-running.
for org in top_50_list[100:]:
    print('Plotting for {}'.format(org))
    merge_plots(org)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment