Last active
November 22, 2019 16:48
-
-
Save wangfan860/0534ebf09154a1e33f28aa126f10fd4f to your computer and use it in GitHub Desktop.
For comparison between TCGA-BRCA real expression with predicted gene expression. Raw Pearson is low even in normal tissue samples. Integrate quantile normalization and inverse quantile normalization to process the real expression in the last part
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gtfparse import read_gtf | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import gzip | |
import subprocess | |
import scipy.stats as stats | |
import argparse | |
import os | |
from scipy.stats import norm | |
print('reading gencode data') | |
df = read_gtf("/mnt/SCRATCH/topmed_predixcan/gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz") | |
# filter DataFrame to gene only | |
df_genes = df[df["feature"] == "gene"] | |
df_genes1=df_genes[['gene_id','transcript_name']] | |
dictionary = dict(zip(df_genes1['gene_id'],df_genes1['transcript_name'])) | |
print('reading predixcan output data') | |
predict = pd.read_csv('/mnt/SCRATCH/TCGA_prediXcan/output/predixcan_output/_predicted_expression.txt', sep='\t') | |
predict1=predict.drop(['FID'], axis=1) | |
# Intersection of two lists | |
def intersection(lst1, lst2): | |
return list(set(lst1) & set(lst2)) | |
len(predict1.columns) | |
#5308 | |
len(intersection(predict1.columns, df_genes1.gene_id)) | |
#5307 | |
# replace column names in predict1 with official gene name | |
new_header=predict1.columns.map(dictionary).fillna('sample') | |
predict2=predict1 | |
predict2.columns=new_header | |
predict2['sample']=predict2['sample'].str.split('-10').str.get(0) | |
# turn a dataframe into a single list | |
def make_list(df): | |
result=[] | |
for i,n in enumerate(df.columns): | |
result.extend(df.values[i]) | |
return result | |
print('reading tcga expression data') | |
real = pd.read_csv('/mnt/SCRATCH/TCGA_prediXcan/output/predixcan_output/gdac.broadinstitute.org_BRCA.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/BRCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt', sep='\t',skiprows=[1]) | |
#real expression contains both tumor and normal | |
#In [32]: set(real.columns.to_series().str.split('-').str.get(3)) | |
#Out[32]: {'01A', '01B', '06A', '11A', '11B', nan} | |
tumor=real[real.columns[real.columns.to_series().str.contains('01A|01B|Hybridization')]] | |
tumor_pt=tumor.columns.to_series().str.split('-01').str.get(0).tolist() | |
tumor.columns=tumor_pt | |
tumor['gene']=tumor['Hybridization REF'].str.split('|').str.get(0) | |
tumor_data=make_list(tumor.drop(['Hybridization REF','gene'],axis=1)) | |
print("min of tumor: {} ".format(min(tumor_data))) | |
tumor_no_zero=[0.001 if x==0 else x for x in tumor_data] | |
#dim (20531, 1093) | |
normal=real[real.columns[real.columns.to_series().str.contains('11A|11B|Hybridization')]] | |
normal_pt=normal.columns.to_series().str.split('-11').str.get(0).tolist() | |
normal.columns=normal_pt | |
normal['gene']=normal['Hybridization REF'].str.split('|').str.get(0) | |
normal_data=make_list(normal.drop(['Hybridization REF','gene'],axis=1)) | |
print("min of normal: {} ".format(min(normal_data))) | |
normal_no_zero=[0.001 if x==0 else x for x in normal_data] | |
#dim (20531, 112) | |
metastatic = real[real.columns[real.columns.to_series().str.contains('06A|Hybridization')]] | |
metastatic_pt=metastatic.columns.to_series().str.split('-06').str.get(0).tolist() | |
metastatic.columns=metastatic_pt | |
metastatic['gene']=metastatic['Hybridization REF'].str.split('|').str.get(0) | |
metastatic_data=make_list(metastatic.drop(['Hybridization REF','gene'],axis=1)) | |
print("min of metastatic: {} ".format(min(metastatic_data))) | |
metastatic_no_zero=[0.001 if x==0 else x for x in metastatic_data] | |
#dim (20531, 7) | |
print('making plot') | |
plt.figure(figsize=(10,6)) | |
sns_plot=sns.distplot( np.log10(tumor_no_zero) , color="red", label="BRCA_tumor") | |
sns_plot=sns.distplot( np.log10(normal_no_zero) , color="green", label="BRCA_normal") | |
sns_plot=sns.distplot( np.log10(metastatic_no_zero) , color="orange", label="BRCA_metastatic") | |
plt.legend() | |
plt.xlabel('expression', fontsize=10) | |
plt.ylabel('frequency', fontsize=10) | |
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/expression_distribution.png',dpi=300, facecolor='w') | |
# set the threshold as -2, so genes with more than 80% patients' expression lower than 0.01 will be removed. | |
## 1093*0.8= | |
tumor1=tumor[tumor.gene != '?'] | |
tumor2=tumor1.drop(['Hybridization REF'], axis=1) | |
tumor3 = tumor2.set_index('gene').T | |
tumor_filter=tumor3.loc[:,tumor3[tumor3<0.01].count()<874] | |
tumor_filter['sample']=tumor_filter.index | |
## 112*0.8 | |
normal1=normal[normal.gene != '?'] | |
normal2=normal1.drop(['Hybridization REF'], axis=1) | |
normal3 = normal2.set_index('gene').T | |
normal_filter=normal3.loc[:,normal3[normal3<0.01].count()<89] | |
normal_filter['sample']=normal_filter.index | |
## 7*0.8 | |
metastatic1=metastatic[metastatic.gene != '?'] | |
metastatic2=metastatic1.drop(['Hybridization REF'], axis=1) | |
metastatic3 = metastatic2.set_index('gene').T | |
metastatic_filter=metastatic3.loc[:,normal3[normal3<0.01].count()<5] | |
metastatic_filter['sample']=metastatic_filter.index | |
#prepare tumor and prediction | |
sample_t_p=set(tumor_filter['sample']).intersection(predict2['sample']) | |
gene_t_p=set(tumor_filter.columns).intersection(predict2.columns) | |
tumor_tocompare=tumor_filter.loc[tumor_filter['sample'].isin(sample_t_p),gene_t_p] | |
tumor_predict=predict2.loc[predict2['sample'].isin(sample_t_p),gene_t_p] | |
tumor_tocompare=tumor_tocompare.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1) | |
tumor_predict=tumor_predict.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1) | |
tumor_corr=tumor_predict.corrwith(tumor_tocompare, axis=0, method='pearson') | |
#prepare normal and prediction | |
sample_t_p=set(normal_filter['sample']).intersection(predict2['sample']) | |
gene_t_p=set(normal_filter.columns).intersection(predict2.columns) | |
normal_tocompare=normal_filter.loc[normal_filter['sample'].isin(sample_t_p),gene_t_p] | |
normal_predict=predict2.loc[predict2['sample'].isin(sample_t_p),gene_t_p] | |
normal_tocompare=normal_tocompare.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1) | |
normal_predict=normal_predict.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1) | |
normal_corr=normal_predict.corrwith(normal_tocompare, axis=0, method='pearson') | |
#prepare metastatic and prediction | |
sample_t_p=set(metastatic_filter['sample']).intersection(predict2['sample']) | |
gene_t_p=set(metastatic_filter.columns).intersection(predict2.columns) | |
metastatic_tocompare=metastatic_filter.loc[metastatic_filter['sample'].isin(sample_t_p),gene_t_p] | |
metastatic_predict=predict2.loc[predict2['sample'].isin(sample_t_p),gene_t_p] | |
metastatic_tocompare=metastatic_tocompare.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1) | |
metastatic_predict=metastatic_predict.sort_values(by=['sample']).drop(['sample'], axis=1).reset_index().drop(['index'],axis=1) | |
metastatic_corr=metastatic_predict.corrwith(metastatic_tocompare, axis=0, method='pearson') | |
# correlation plot | |
plt.figure(figsize=(10,6)) | |
sns_plot=sns.distplot( tumor_corr.dropna() , color="red", label="BRCA_tumor") | |
sns_plot=sns.distplot( normal_corr.dropna() , color="green", label="BRCA_normal") | |
sns_plot=sns.distplot( metastatic_corr.dropna(), color="orange", label="BRCA_metastatic") | |
plt.legend() | |
plt.xlabel('Correlation between prediction vs. true expression', fontsize=10) | |
plt.ylabel('frequency', fontsize=10) | |
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/correlation_distribution.png',dpi=300, facecolor='w') | |
# r square plot | |
plt.figure(figsize=(10,6)) | |
sns_plot=sns.distplot( (tumor_corr.dropna())**2 , label="BRCA_tumor",fit=norm,kde=False, hist=False,fit_kws={"color":"red"}) | |
sns_plot=sns.distplot( (normal_corr.dropna())**2 ,label="BRCA_normal",fit=norm,kde=False, hist=False,fit_kws={"color":"g"}) | |
sns_plot=sns.distplot( (metastatic_corr.dropna())**2,label="BRCA_metastatic",fit=norm,kde=False, hist=False,fit_kws={"color":"orange"}) | |
plt.legend() | |
plt.xlabel('R square between prediction vs. true expression', fontsize=10) | |
plt.ylabel('frequency', fontsize=10) | |
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_square_distribution.png',dpi=300, facecolor='w') | |
# r square plot for normal | |
plt.figure(figsize=(10,6)) | |
sns_plot=sns.distplot( (normal_corr.dropna())**2 , color="green", label="BRCA_normal") | |
plt.legend() | |
plt.xlabel('R square between prediction vs. true expression', fontsize=10) | |
plt.ylabel('frequency', fontsize=10) | |
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_square_normal_distribution.png',dpi=300, facecolor='w') | |
# this script quantile normalizes the expression values to average empirical distribution observed across samples, and then for each gene, expression values are inverse quantile normalized to a standard normal distribution across samples. | |
def normalize_quantiles(M, inplace=False): | |
""" | |
Note: replicates behavior of R function normalize.quantiles from library("preprocessCore") | |
Reference: | |
[1] Bolstad et al., Bioinformatics 19(2), pp. 185-193, 2003 | |
Adapted from https://github.com/andrewdyates/quantile_normalize | |
""" | |
if not inplace: | |
M = M.copy() | |
Q = M.argsort(axis=0) | |
m,n = M.shape | |
# compute quantile vector | |
quantiles = np.zeros(m) | |
for i in range(n): | |
quantiles += M[Q[:,i],i] | |
quantiles = quantiles / n | |
for i in range(n): | |
# Get equivalence classes; unique values == 0 | |
dupes = np.zeros(m, dtype=np.int) | |
for j in range(m-1): | |
if M[Q[j,i],i]==M[Q[j+1,i],i]: | |
dupes[j+1] = dupes[j]+1 | |
# Replace column with quantile ranks | |
M[Q[:,i],i] = quantiles | |
# Average together equivalence classes | |
j = m-1 | |
while j >= 0: | |
if dupes[j] == 0: | |
j -= 1 | |
else: | |
idxs = Q[j-dupes[j]:j+1,i] | |
M[idxs,i] = np.median(M[idxs,i]) | |
j -= 1 + dupes[j] | |
assert j == -1 | |
if not inplace: | |
return M | |
def inverse_quantile_normalization(M): | |
""" | |
After quantile normalization of samples, standardize expression of each gene | |
""" | |
R = stats.mstats.rankdata(M,axis=1) # ties are averaged | |
Q = stats.norm.ppf(R/(M.shape[1]+1)) | |
return Q | |
# apply quantile normalization and inverse quantile normalization before correlation | |
#prepare quantile normalized tumor | |
quant_real_tumor=pd.DataFrame.from_records(inverse_quantile_normalization(normalize_quantiles(tumor_tocompare.to_numpy()))) | |
quant_real_tumor.columns=tumor_predict.columns | |
tumor_corr_normalize=tumor_predict.corrwith(quant_real_tumor, axis=0, method='pearson') | |
#prepare quantile normalized normal | |
quant_real_normal=pd.DataFrame.from_records(inverse_quantile_normalization(normalize_quantiles(normal_tocompare.to_numpy()))) | |
quant_real_normal.columns=normal_predict.columns | |
normal_corr_normalize=normal_predict.corrwith(quant_real_normal, axis=0, method='pearson') | |
#prepare quantile normalized metastatic | |
quant_real_metastatic=pd.DataFrame.from_records(inverse_quantile_normalization(normalize_quantiles(metastatic_tocompare.to_numpy()))) | |
quant_real_metastatic.columns=metastatic_predict.columns | |
metastatic_corr_normalize=metastatic_predict.corrwith(quant_real_metastatic, axis=0, method='pearson') | |
# r correlation plot | |
plt.figure(figsize=(10,6)) | |
sns_plot=sns.distplot( tumor_corr_normalize.dropna() , color="red", label="BRCA_tumor") | |
sns_plot=sns.distplot( normal_corr_normalize.dropna() , color="green", label="BRCA_normal") | |
sns_plot=sns.distplot( metastatic_corr_normalize.dropna(), color="orange", label="BRCA_metastatic") | |
plt.legend() | |
plt.xlabel('Correlation after quantile normalization between prediction vs. true expression', fontsize=10) | |
plt.ylabel('frequency', fontsize=10) | |
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_distribution_quantile_normalized.png',dpi=300, facecolor='w') | |
# r square plot | |
plt.figure(figsize=(10,6)) | |
sns_plot=sns.distplot( (tumor_corr_normalize.dropna())**2 , color="red", label="BRCA_tumor") | |
sns_plot=sns.distplot( (normal_corr_normalize.dropna())**2 , color="green", label="BRCA_normal") | |
sns_plot=sns.distplot( (metastatic_corr_normalize.dropna())**2, color="orange", label="BRCA_metastatic") | |
plt.legend() | |
plt.xlabel('R square after quantile normalization between prediction vs. true expression', fontsize=10) | |
plt.ylabel('frequency', fontsize=10) | |
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_square_distribution_quantile_normalized.png',dpi=300, facecolor='w') | |
# r square plot for normal | |
plt.figure(figsize=(10,6)) | |
sns_plot=sns.distplot( (normal_corr.dropna())**2 , color="green", label="BRCA_normal_raw_expression") | |
sns_plot=sns.distplot( (normal_corr_normalize.dropna())**2 , color="orange", label="BRCA_normal_quantile_normalized") | |
plt.legend() | |
plt.xlabel('R square', fontsize=10) | |
plt.ylabel('frequency', fontsize=10) | |
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/r_square_normal_quantile_normalized_vs_not.png',dpi=300, facecolor='w') | |
##pca plot | |
common_sample = intersection(tumor_predict.columns, normal_predict.columns) | |
tumor_predict_pca = tumor_predict[common_sample] | |
tumor_tocompare_pca = tumor_tocompare[common_sample] | |
normal_predict_pca = normal_predict[common_sample] | |
normL_tocompare_pca = normal_tocompare[common_sample] | |
tumor_predict_pca['type'] = 'Tumor' | |
tumor_tocompare_pca['type'] = 'Tumor' | |
normal_predict_pca['type'] = 'Normal' | |
normL_tocompare_pca['type'] = 'Normal' | |
predict_all_pca = pd.concat([tumor_predict_pca, normal_predict_pca]) | |
tocompare_all_pca = pd.concat([tumor_tocompare_pca, normL_tocompare_pca]) | |
predict_all_pca.to_csv('/mnt/SCRATCH/TCGA_prediXcan/predict_all_pca.csv', index=False) | |
tocompare_all_pca.to_csv('/mnt/SCRATCH/TCGA_prediXcan/ture_expression_all_pca.csv', index=False) | |
##scater plot | |
tumor_table = tumor_corr.to_frame() | |
normal_table = normal_corr.to_frame() | |
normal_table['normal_r_square']=normal_table[0]**2 | |
normal_table.columns = ['normal_r','normal_r_square'] | |
tumor_table['tumor_r_square']=tumor_table[0]**2 | |
tumor_table.columns = ['tumor_r','tumor_r_square'] | |
merge=pd.merge(tumor_table, normal_table, left_index=True, right_index=True) | |
#plt plot | |
plt.figure(figsize=(8,8)) | |
plt.scatter('tumor_r_square', 'normal_r_square', data=merge, s=15, c='red',alpha=0.5) | |
x = np.linspace(0,0.5,100) | |
plt.plot(x, x + 0, ':b') | |
for i,type in enumerate(merge.index): | |
x = merge.tumor_r_square[i] | |
y = merge.normal_r_square[i] | |
if x > 0.15: | |
plt.text(x, y, type, fontsize=5) | |
if y > 0.4: | |
plt.text(x, y, type, fontsize=5) | |
plt.xlabel('R square in tumor', fontsize=10) | |
plt.ylabel('R square in normal', fontsize=10) | |
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/correlation_scatter_r_square.png',dpi=300, facecolor='w') | |
##get correlation p value | |
def corr_table(predict, tocompare): | |
from scipy.stats import pearsonr | |
genes = [] | |
r = [] | |
pval = [] | |
for i in tumor_predict.columns: | |
genes.append(i) | |
correlation = pearsonr(predict[i], tocompare[i]) | |
r.append(correlation[0]) | |
pval.append(correlation[1]) | |
df = pd.DataFrame(list(zip(genes, r, pval)), columns =['gene', 'r','pval']) | |
df['r_square']=df['r']**2 | |
return df | |
tumor_corr_table= corr_table(tumor_predict, tumor_tocompare) | |
normal_corr_table= corr_table(normal_predict, normal_tocompare) | |
def plot_real_predict_expression(predict_data, tocompare_data, gene, color, corr_table): | |
import seaborn as sns; sns.set(color_codes=True) | |
plt.figure(figsize=(8,8)) | |
sns.regplot(x=predict_data[gene], y=tocompare_data[gene], ci=95,scatter_kws={"s": 10, "alpha": 0.7, "color": color}) | |
plt.xlabel("Predicted expression") | |
plt.ylabel("RNA-seq expression") | |
r_square = round(corr_table.loc[corr_table['gene']==gene, 'r_square'].values[0], 3) | |
pval = "%.3g" % corr_table.loc[corr_table['gene']==gene, 'pval'].values[0] | |
plt.title(gene+" R square = "+str(r_square) + " P-val = "+str(pval), fontsize=18) | |
plt.savefig('/mnt/SCRATCH/TCGA_prediXcan/{}_{}.png'.format(gene,color),dpi=300, facecolor='w') | |
plot_real_predict_expression(tumor_predict, tumor_tocompare, 'ERAP2','r',tumor_corr_table) | |
plot_real_predict_expression(tumor_predict, tumor_tocompare, 'OR7D2','r',tumor_corr_table) | |
plot_real_predict_expression(tumor_predict, tumor_tocompare, 'HLA-DRB5','r',tumor_corr_table) | |
plot_real_predict_expression(tumor_predict, tumor_tocompare, 'THNSL2','r',tumor_corr_table) | |
plot_real_predict_expression(normal_predict, normal_tocompare, 'ERAP2','g',normal_corr_table) | |
plot_real_predict_expression(normal_predict, normal_tocompare, 'C1QL3','g',normal_corr_table) | |
plot_real_predict_expression(normal_predict, normal_tocompare, 'XRRA1','g',normal_corr_table) | |
plot_real_predict_expression(normal_predict, normal_tocompare, 'ST7L','g',normal_corr_table) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment