Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save hiraksarkar/25f9cc89be0471f4e6666ee9eeebd940 to your computer and use it in GitHub Desktop.
Python file for coverage plots
import glob
import pandas as pd
import tqdm
#from pyfasta import Fasta
#%matplotlib inline
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import seaborn as sns
import numpy as np
import pysam
import os
import sys
sys.path.insert(0, '/home/hirak/Projects/Linguistic_Analysis_SRAdb/notebook/contamenation_metadata_analysis/')
from contamination import brokenaxes
# Taxonomy table: maps organism display names and reference code names
# to taxonomy ids (columns used elsewhere: names, organism, txid).
taxadf = pd.read_csv('/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/taxadf.tsv', sep='\t')

# Headerless two-column table: reference code name, sequence length.
length_df = pd.read_csv(
    '/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/combined/seq_length.tsv',
    sep='\t',
    header=None,
)

# Lookup: reference code name -> sequence length.
ref_length = {code: length for code, length in zip(length_df[0], length_df[1])}
import re
def nonzero_intervals(vec):
    """Find the half-open intervals [start, stop) where *vec* is non-zero.

    Parameters
    ----------
    vec : 1-D array-like of numbers
        Converted to ``np.ndarray`` if it is not one already.

    Returns
    -------
    tuple of (start, stop) pairs
        One pair per contiguous run of non-zero entries; empty tuple for
        empty input.  (Fix: the original returned a ``list`` for empty
        input but a ``tuple`` otherwise.)

    Notes
    -----
    NaN compares unequal to zero, so NaN entries count as non-zero.
    """
    if len(vec) == 0:
        return ()
    if not isinstance(vec, np.ndarray):
        vec = np.asarray(vec)
    # Indices where the zero/non-zero mask flips; the +1 compensates for
    # the one-element offset introduced by np.diff.
    edges, = np.nonzero(np.diff((vec == 0).astype(int)))
    edge_vec = [edges + 1]
    if vec[0] != 0:
        # Vector starts inside an island: open the first interval at 0.
        edge_vec.insert(0, [0])
    if vec[-1] != 0:
        # Vector ends inside an island: close the last interval at len(vec).
        edge_vec.append([len(vec)])
    edges = np.concatenate(edge_vec)
    # Even positions are interval starts, odd positions are stops.
    return tuple(zip(edges[::2], edges[1::2]))
def slugify(value):
    """Normalize *value* into a filesystem-friendly slug.

    Strips characters that are not word characters, whitespace, or
    hyphens, lowercases the result, and collapses runs of whitespace
    and hyphens into single hyphens.

    Parameters
    ----------
    value : str

    Returns
    -------
    str
    """
    # BUG FIX: the original wrapped both re.sub calls in the Python 2-only
    # builtin ``unicode``, which raises NameError on Python 3.  re.sub
    # already returns str, so no wrapper is needed.  The unused local
    # ``import unicodedata`` (only referenced by commented-out code) is
    # also dropped.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
def merge_plots(org_name):
    """Plot read coverage for the most-abundant SRR of *org_name*, save a PNG.

    Looks up the taxonomy id for ``org_name`` in the module-level ``taxadf``,
    picks the single SRR accession with the highest raw abundance for that
    taxon from ``raw_abundance_matrix``, accumulates per-position coverage
    from that SRR's alignment file, dumps the coverage array, writes read
    names falling in high-coverage intervals to a ``.list`` file, and saves
    a log-scale coverage plot.

    Parameters
    ----------
    org_name : str
        Organism name; must appear in ``taxadf.names``.

    Side effects: writes ``{slug}.list``, ``{slug}_{srr}.dat`` and
    ``{slug}.cov.png`` under the coverage_plots_June_26 directory.

    NOTE(review): indentation was lost in the source paste; the block
    structure below is a best-effort reconstruction.
    """
    # '/' in an organism name would act as a path separator in the output
    # filenames, so replace it before slugifying.
    if '/' in org_name:
        org_name_mod = org_name.replace('/',' ')
    else:
        org_name_mod = org_name
    org_name_mod = slugify(org_name_mod)
    fig, ax = plt.subplots(figsize=(10,7))
    # Organism name -> taxonomy id, then every reference code name
    # belonging to that taxon and their sequence lengths.
    txid = taxadf.loc[taxadf.names == org_name]['txid'].values[0]
    code_names = taxadf.loc[taxadf.txid == txid]['organism'].values
    lengths = [ref_length[code_name] for code_name in code_names]
    ax.set_xlim(right=max(lengths))
    # Only the single most-abundant SRR for this taxon (nlargest(1)),
    # so the loop below runs at most once.
    top_srr = raw_abundance_matrix.loc[txid].nlargest(1).index.tolist()
    #print('SRR ID: ', srr_name)
    #os.system('mkdir /mnt/scratch4/meta_genome/coverage_plots_metagenome_merged_top_10/' + org_name_mod)
    for srr in top_srr:
        # Per-position coverage counter sized to the longest reference.
        coverage_array = np.empty(max(lengths))
        coverage_array.fill(0.0)
        covfile = '/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/pufferfish_phiX/'+srr+'/HumanBacViral_sr0.85.txt'
        srr_name_file = '/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/coverage_plots_June_26/{}.list'.format(org_name_mod)
        # NOTE(review): fp is (re)opened on every iteration but closed only
        # once after the loop — a handle leak if top_srr ever holds more
        # than one entry; harmless while nlargest(1) is used.
        fp = open(srr_name_file, 'w')
        df_cov = pd.read_csv(
            covfile
        )
        # Keep only hits on this taxon, one record per (read, reference).
        df_cov = df_cov.loc[df_cov.refId == txid]
        df_cov = df_cov.drop_duplicates(['readName','refId'])
        if(len(df_cov) > 0):
            # Each row adds +1 coverage over [refPos, refPos + score).
            # NOTE(review): 'score' is used as the span length here —
            # confirm against the alignment-output column semantics.
            for row in df_cov[['refPos','score']].iterrows():
                coverage_array[row[1][0]:row[1][0] + row[1][1]] += 1
            coverage_array.dump(
                '/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/coverage_plots_June_26/{}_{}.dat'.format(org_name_mod,srr)
            )
            # Mask positions with coverage < 100 (numpy coerces 'nan' to nan).
            # NOTE(review): nan != 0, so nonzero_intervals treats the masked
            # positions as non-zero — verify the resulting interval set is
            # the intended one.
            coverage_array[coverage_array < 100] = 'nan'
            coverage_intervals = nonzero_intervals(coverage_array)
            # Write down the names of the sequences
            for a,b in coverage_intervals:
                # Reads whose start position lies within 100 bp of the interval.
                names = df_cov.loc[(df_cov.refPos > a - 100) & (df_cov.refPos < b + 100)].drop_duplicates(['readName', 'refPos']).readName.values
                for name in names:
                    fp.write('{}\n'.format(name))
                # Separator between interval groups in the .list file.
                fp.write('--\n')
            #coverage_array[coverage_array == 0] = 'nan'
            #fig = plt.figure(figsize=(10,7))
            #bax = brokenaxes.brokenaxes(xlims=tuple(coverage_intervals), hspace=.05, yscale='log')
            #bax.semilogy(coverage_array, 'o', markersize=7, label=srr)
            #bax.grid(axis='both', which='major', ls='-')
            #bax.grid(axis='both', which='minor', ls='--', alpha=0.4)
            ax.plot(coverage_array , 'o',linewidth=1.2, markersize=10, label= '{0}'.format(srr), alpha=0.5)
            plt.legend(loc=2, borderaxespad=0., numpoints = 1, markerscale = 1)
    #bax.set_xlabel('Genome Index')
    #bax.set_ylabel('Coverage Score')
    plt.yscale('log')
    plt.savefig('/mnt/scratch7/hirak/SRA_contamination_analysis/feb_11/coverage_plots_June_26/{}.cov.png'.format(org_name_mod))
    plt.close()
    fp.close()
# Taxon-by-SRR abundance tables, indexed by taxonomy id.
raw_abundance_matrix = pd.read_csv(
    '/mnt/scratch7/hirak/SRA_contamination_analysis/GSE_metadata_dataframes/abundance_raw_973.tsv',
    sep='\t',
    index_col=0,
)
abundance_matrix = pd.read_csv(
    '/mnt/scratch7/hirak/SRA_contamination_analysis/GSE_metadata_dataframes/abundance_973.tsv',
    sep='\t',
    index_col=0,
)

# Despite the name, this holds the 500 taxa with the largest total
# abundance summed across all samples.
top_50_list = abundance_matrix.T.sum().nlargest(500).index.tolist()

# Generate coverage plots for taxa ranked 101..500 (the first 100 are skipped).
for org in top_50_list[100:]:
    print('Plotting for {}'.format(org))
    merge_plots(org)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment