Skip to content

Instantly share code, notes, and snippets.

@wangfan860
Last active November 22, 2019 16:48
Show Gist options
  • Save wangfan860/0534ebf09154a1e33f28aa126f10fd4f to your computer and use it in GitHub Desktop.
Save wangfan860/0534ebf09154a1e33f28aa126f10fd4f to your computer and use it in GitHub Desktop.
For comparison between TCGA-BRCA real expression with predicted gene expression. Raw Pearson is low even in normal tissue samples. Integrate quantile normalization and inverse quantile normalization to process the real expression in the last part
from gtfparse import read_gtf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import gzip
import subprocess
import scipy.stats as stats
import argparse
import os
from scipy.stats import norm
print('reading gencode data')
df = read_gtf("/mnt/SCRATCH/topmed_predixcan/gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz")
# filter DataFrame to gene only
df_genes = df[df["feature"] == "gene"]
df_genes1=df_genes[['gene_id','transcript_name']]
dictionary = dict(zip(df_genes1['gene_id'],df_genes1['transcript_name']))
print('reading predixcan output data')
predict = pd.read_csv('/mnt/SCRATCH/TCGA_prediXcan/output/predixcan_output/_predicted_expression.txt', sep='\t')
predict1=predict.drop(['FID'], axis=1)
# Intersection of two lists
def intersection(lst1, lst2):
return list(set(lst1) & set(lst2))
len(predict1.columns)
#5308
len(intersection(predict1.columns, df_genes1.gene_id))
#5307
# replace column names in predict1 with official gene name
new_header=predict1.columns.map(dictionary).fillna('sample')
predict2=predict1
predict2.columns=new_header
predict2['sample']=predict2['sample'].str.split('-10').str.get(0)
# turn a dataframe into a single list
def make_list(df):
result=[]
for i,n in enumerate(df.columns):
result.extend(df.values[i])
return result
print('reading tcga expression data')
real = pd.read_csv('/mnt/SCRATCH/TCGA_prediXcan/output/predixcan_output/gdac.broadinstitute.org_BRCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/BRCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt', sep='\t',skiprows=[1])
#real expression contains both tumor and normal
#In [32]: set(real.columns.to_series().str.split('-').str.get(3))
#Out[32]: {'01A', '01B', '06A', '11A', '11B', nan}
tumor=real[real.columns[real.columns.to_series().str.contains('01A|01B|Hybridization')]]
tumor_pt=tumor.columns.to_series().str.split('-01').str.get(0).tolist()
tumor.columns=tumor_pt
tumor['gene']=tumor['Hybridization REF'].str.split('|').str.get(0)
tumor_data=make_list(tumor.drop(['Hybridization REF','gene'],axis=1))
print("min of tumor: {} ".format(min(tumor_data)))
tumor_no_zero=[0.001 if x==0 else x for x in tumor_data]
#dim (20531, 1093)
normal=real[real.columns[real.columns.to_series().str.contains('11A|11B|Hybridization')]]
normal_pt=normal.columns.to_series().str.split('-11').str.get(0).tolist()
normal.columns=normal_pt
normal['gene']=normal['Hybridization REF'].str.split('|').str.get(0)
normal_data=make_list(normal.drop(['Hybridization REF','gene'],axis=1))
print("min of normal: {} ".format(min(normal_data)))
normal_no_zero=[0.001 if x==0 else x for x in normal_data]
#dim (20531, 112)
metastatic = real[real.columns[real.columns.to_series().str.contains('06A|Hybridization')]]
metastatic_pt=metastatic.columns.to_series().str.split('-06').str.get(0).tolist()
metastatic.columns=metastatic_pt
metastatic['gene']=metastatic['Hybridization REF'].str.split('|').str.get(0)
metastatic_data=make_list(metastatic.drop(['Hybridization REF','gene'],axis=1))
print("min of metastatic: {} ".format(min(metastatic_data)))
metastatic_no_zero=[0.001 if x==0 else x for x in metastatic_data]
#dim (20531, 7)
print('making plot')
plt.figure(figsize=(10,6))
sns_plot=sns.distplot( np.log10(tumor_no_zero) , color="red", label="BRCA_tumor")
sns_plot=sns.distplot( np.log10(normal_no_zero) , color="green", label="BRCA_normal")
sns_plot=sns.distplot( np.log10(metastatic_no_zero) , color="orange", label="BRCA_metastatic")
plt.legend()
plt.xlabel('expression', fontsize=10)
plt.ylabel('frequency', fontsize=10)
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/expression_distribution.png',dpi=300, facecolor='w')
# set the threshold as -2, so genes with more than 80% patients' expression lower than 0.01 will be removed.
## 1093*0.8=
tumor1=tumor[tumor.gene != '?']
tumor2=tumor1.drop(['Hybridization REF'], axis=1)
tumor3 = tumor2.set_index('gene').T
tumor_filter=tumor3.loc[:,tumor3[tumor3<0.01].count()<874]
tumor_filter['sample']=tumor_filter.index
## 112*0.8
normal1=normal[normal.gene != '?']
normal2=normal1.drop(['Hybridization REF'], axis=1)
normal3 = normal2.set_index('gene').T
normal_filter=normal3.loc[:,normal3[normal3<0.01].count()<89]
normal_filter['sample']=normal_filter.index
## 7*0.8
metastatic1=metastatic[metastatic.gene != '?']
metastatic2=metastatic1.drop(['Hybridization REF'], axis=1)
metastatic3 = metastatic2.set_index('gene').T
metastatic_filter=metastatic3.loc[:,normal3[normal3<0.01].count()<5]
metastatic_filter['sample']=metastatic_filter.index
#prepare tumor and prediction
sample_t_p=set(tumor_filter['sample']).intersection(predict2['sample'])
gene_t_p=set(tumor_filter.columns).intersection(predict2.columns)
tumor_tocompare=tumor_filter.loc[tumor_filter['sample'].isin(sample_t_p),gene_t_p]
tumor_predict=predict2.loc[predict2['sample'].isin(sample_t_p),gene_t_p]
tumor_tocompare=tumor_tocompare.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1)
tumor_predict=tumor_predict.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1)
tumor_corr=tumor_predict.corrwith(tumor_tocompare, axis=0, method='pearson')
#prepare normal and prediction
sample_t_p=set(normal_filter['sample']).intersection(predict2['sample'])
gene_t_p=set(normal_filter.columns).intersection(predict2.columns)
normal_tocompare=normal_filter.loc[normal_filter['sample'].isin(sample_t_p),gene_t_p]
normal_predict=predict2.loc[predict2['sample'].isin(sample_t_p),gene_t_p]
normal_tocompare=normal_tocompare.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1)
normal_predict=normal_predict.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1)
normal_corr=normal_predict.corrwith(normal_tocompare, axis=0, method='pearson')
#prepare metastatic and prediction
sample_t_p=set(metastatic_filter['sample']).intersection(predict2['sample'])
gene_t_p=set(metastatic_filter.columns).intersection(predict2.columns)
metastatic_tocompare=metastatic_filter.loc[metastatic_filter['sample'].isin(sample_t_p),gene_t_p]
metastatic_predict=predict2.loc[predict2['sample'].isin(sample_t_p),gene_t_p]
metastatic_tocompare=metastatic_tocompare.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1)
metastatic_predict=metastatic_predict.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1)
metastatic_corr=metastatic_predict.corrwith(metastatic_tocompare, axis=0, method='pearson')
# correlation plot
plt.figure(figsize=(10,6))
sns_plot=sns.distplot( tumor_corr.dropna() , color="red", label="BRCA_tumor")
sns_plot=sns.distplot( normal_corr.dropna() , color="green", label="BRCA_normal")
sns_plot=sns.distplot( metastatic_corr.dropna(), color="orange", label="BRCA_metastatic")
plt.legend()
plt.xlabel('Correlation between prediction vs. true expression', fontsize=10)
plt.ylabel('frequency', fontsize=10)
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/correlation_distribution.png',dpi=300, facecolor='w')
# r square plot
plt.figure(figsize=(10,6))
sns_plot=sns.distplot( (tumor_corr.dropna())**2 , label="BRCA_tumor",fit=norm,kde=False, hist=False,fit_kws={"color":"red"})
sns_plot=sns.distplot( (normal_corr.dropna())**2 ,label="BRCA_normal",fit=norm,kde=False, hist=False,fit_kws={"color":"g"})
sns_plot=sns.distplot( (metastatic_corr.dropna())**2,label="BRCA_metastatic",fit=norm,kde=False, hist=False,fit_kws={"color":"orange"})
plt.legend()
plt.xlabel('R square between prediction vs. true expression', fontsize=10)
plt.ylabel('frequency', fontsize=10)
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_square_distribution.png',dpi=300, facecolor='w')
# r square plot for normal
plt.figure(figsize=(10,6))
sns_plot=sns.distplot( (normal_corr.dropna())**2 , color="green", label="BRCA_normal")
plt.legend()
plt.xlabel('R square between prediction vs. true expression', fontsize=10)
plt.ylabel('frequency', fontsize=10)
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_square_normal_distribution.png',dpi=300, facecolor='w')
# this script quantile normalizes the expression values to average empirical distribution observed across samples, and then for each gene, expression values are inverse quantile normalized to a standard normal distribution across samples.
def normalize_quantiles(M, inplace=False):
"""
Note: replicates behavior of R function normalize.quantiles from library("preprocessCore")
Reference:
[1] Bolstad et al., Bioinformatics 19(2), pp. 185-193, 2003
Adapted from https://github.com/andrewdyates/quantile_normalize
"""
if not inplace:
M = M.copy()
Q = M.argsort(axis=0)
m,n = M.shape
# compute quantile vector
quantiles = np.zeros(m)
for i in range(n):
quantiles += M[Q[:,i],i]
quantiles = quantiles / n
for i in range(n):
# Get equivalence classes; unique values == 0
dupes = np.zeros(m, dtype=np.int)
for j in range(m-1):
if M[Q[j,i],i]==M[Q[j+1,i],i]:
dupes[j+1] = dupes[j]+1
# Replace column with quantile ranks
M[Q[:,i],i] = quantiles
# Average together equivalence classes
j = m-1
while j >= 0:
if dupes[j] == 0:
j -= 1
else:
idxs = Q[j-dupes[j]:j+1,i]
M[idxs,i] = np.median(M[idxs,i])
j -= 1 + dupes[j]
assert j == -1
if not inplace:
return M
def inverse_quantile_normalization(M):
"""
After quantile normalization of samples, standardize expression of each gene
"""
R = stats.mstats.rankdata(M,axis=1) # ties are averaged
Q = stats.norm.ppf(R/(M.shape[1]+1))
return Q
# apply quantile normalization and inverse quantile normalization before correlation
#prepare quantile normalized tumor
quant_real_tumor=pd.DataFrame.from_records(inverse_quantile_normalization(normalize_quantiles(tumor_tocompare.to_numpy())))
quant_real_tumor.columns=tumor_predict.columns
tumor_corr_normalize=tumor_predict.corrwith(quant_real_tumor, axis=0, method='pearson')
#prepare quantile normalized normal
quant_real_normal=pd.DataFrame.from_records(inverse_quantile_normalization(normalize_quantiles(normal_tocompare.to_numpy())))
quant_real_normal.columns=normal_predict.columns
normal_corr_normalize=normal_predict.corrwith(quant_real_normal, axis=0, method='pearson')
#prepare quantile normalized metastatic
quant_real_metastatic=pd.DataFrame.from_records(inverse_quantile_normalization(normalize_quantiles(metastatic_tocompare.to_numpy())))
quant_real_metastatic.columns=metastatic_predict.columns
metastatic_corr_normalize=metastatic_predict.corrwith(quant_real_metastatic, axis=0, method='pearson')
# r correlation plot
plt.figure(figsize=(10,6))
sns_plot=sns.distplot( tumor_corr_normalize.dropna() , color="red", label="BRCA_tumor")
sns_plot=sns.distplot( normal_corr_normalize.dropna() , color="green", label="BRCA_normal")
sns_plot=sns.distplot( metastatic_corr_normalize.dropna(), color="orange", label="BRCA_metastatic")
plt.legend()
plt.xlabel('Correlation after quantile normalization between prediction vs. true expression', fontsize=10)
plt.ylabel('frequency', fontsize=10)
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_distribution_quantile_normalized.png',dpi=300, facecolor='w')
# r square plot
plt.figure(figsize=(10,6))
sns_plot=sns.distplot( (tumor_corr_normalize.dropna())**2 , color="red", label="BRCA_tumor")
sns_plot=sns.distplot( (normal_corr_normalize.dropna())**2 , color="green", label="BRCA_normal")
sns_plot=sns.distplot( (metastatic_corr_normalize.dropna())**2, color="orange", label="BRCA_metastatic")
plt.legend()
plt.xlabel('R square after quantile normalization between prediction vs. true expression', fontsize=10)
plt.ylabel('frequency', fontsize=10)
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_square_distribution_quantile_normalized.png',dpi=300, facecolor='w')
# r square plot for normal
plt.figure(figsize=(10,6))
sns_plot=sns.distplot( (normal_corr.dropna())**2 , color="green", label="BRCA_normal_raw_expression")
sns_plot=sns.distplot( (normal_corr_normalize.dropna())**2 , color="orange", label="BRCA_normal_quantile_normalized")
plt.legend()
plt.xlabel('R square', fontsize=10)
plt.ylabel('frequency', fontsize=10)
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_square_normal_quantile_normalized_vs_not.png',dpi=300, facecolor='w')
##pca plot
common_sample = intersection(tumor_predict.columns, normal_predict.columns)
tumor_predict_pca = tumor_predict[common_sample]
tumor_tocompare_pca = tumor_tocompare[common_sample]
normal_predict_pca = normal_predict[common_sample]
normL_tocompare_pca = normal_tocompare[common_sample]
tumor_predict_pca['type'] = 'Tumor'
tumor_tocompare_pca['type'] = 'Tumor'
normal_predict_pca['type'] = 'Normal'
normL_tocompare_pca['type'] = 'Normal'
predict_all_pca = pd.concat([tumor_predict_pca, normal_predict_pca])
tocompare_all_pca = pd.concat([tumor_tocompare_pca, normL_tocompare_pca])
predict_all_pca.to_csv('/mnt/SCRATCH/TCGA_prediXcan/predict_all_pca.csv', index=False)
tocompare_all_pca.to_csv('/mnt/SCRATCH/TCGA_prediXcan/ture_expression_all_pca.csv', index=False)
##scater plot
tumor_table = tumor_corr.to_frame()
normal_table = normal_corr.to_frame()
normal_table['normal_r_square']=normal_table[0]**2
normal_table.columns = ['normal_r','normal_r_square']
tumor_table['tumor_r_square']=tumor_table[0]**2
tumor_table.columns = ['tumor_r','tumor_r_square']
merge=pd.merge(tumor_table, normal_table, left_index=True, right_index=True)
#plt plot
plt.figure(figsize=(8,8))
plt.scatter('tumor_r_square', 'normal_r_square', data=merge, s=15, c='red',alpha=0.5)
x = np.linspace(0,0.5,100)
plt.plot(x, x + 0, ':b')
for i,type in enumerate(merge.index):
x = merge.tumor_r_square[i]
y = merge.normal_r_square[i]
if x > 0.15:
plt.text(x, y, type, fontsize=5)
if y > 0.4:
plt.text(x, y, type, fontsize=5)
plt.xlabel('R square in tumor', fontsize=10)
plt.ylabel('R square in normal', fontsize=10)
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/correlation_scatter_r_square.png',dpi=300, facecolor='w')
##get correlation p value
def corr_table(predict, tocompare):
from scipy.stats import pearsonr
genes = []
r = []
pval = []
for i in tumor_predict.columns:
genes.append(i)
correlation = pearsonr(predict[i], tocompare[i])
r.append(correlation[0])
pval.append(correlation[1])
df = pd.DataFrame(list(zip(genes, r, pval)), columns =['gene', 'r','pval'])
df['r_square']=df['r']**2
return df
tumor_corr_table= corr_table(tumor_predict, tumor_tocompare)
normal_corr_table= corr_table(normal_predict, normal_tocompare)
def plot_real_predict_expression(predict_data, tocompare_data, gene, color, corr_table):
import seaborn as sns; sns.set(color_codes=True)
plt.figure(figsize=(8,8))
sns.regplot(x=predict_data[gene], y=tocompare_data[gene], ci=95,scatter_kws={"s": 10, "alpha": 0.7, "color": color})
plt.xlabel("Predicted expression")
plt.ylabel("RNA-seq expression")
r_square = round(corr_table.loc[corr_table['gene']==gene, 'r_square'].values[0], 3)
pval = "%.3g" % corr_table.loc[corr_table['gene']==gene, 'pval'].values[0]
plt.title(gene+" R square = "+str(r_square) + " P-val = "+str(pval), fontsize=18)
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/{}_{}.png'.format(gene,color),dpi=300, facecolor='w')
plot_real_predict_expression(tumor_predict, tumor_tocompare, 'ERAP2','r',tumor_corr_table)
plot_real_predict_expression(tumor_predict, tumor_tocompare, 'OR7D2','r',tumor_corr_table)
plot_real_predict_expression(tumor_predict, tumor_tocompare, 'HLA-DRB5','r',tumor_corr_table)
plot_real_predict_expression(tumor_predict, tumor_tocompare, 'THNSL2','r',tumor_corr_table)
plot_real_predict_expression(normal_predict, normal_tocompare, 'ERAP2','g',normal_corr_table)
plot_real_predict_expression(normal_predict, normal_tocompare, 'C1QL3','g',normal_corr_table)
plot_real_predict_expression(normal_predict, normal_tocompare, 'XRRA1','g',normal_corr_table)
plot_real_predict_expression(normal_predict, normal_tocompare, 'ST7L','g',normal_corr_table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment