|
import pandas as pd |
|
import numpy as np |
|
import GEOparse |
|
import matplotlib.pyplot as plt |
|
import matplotlib.gridspec as gridspec |
|
%matplotlib inline |
|
import toolz as tz |
|
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list |
|
from scipy.stats import ttest_ind |
|
|
|
# (Dict (gsm.metadata) -> Dict (fields)) -> GSE -> Dataframe (fields) |
|
def create_metadata_index(parse_func, gse): |
|
""" Parse and combine the metadata from each sample in the provided gse object.""" |
|
return pd.DataFrame([parse_func(gsm.metadata) for _ , gsm in gse.gsms.iteritems()]) |
|
|
|
# Exprs = DataFrame (rows = probe sites, columns = (expression type, sample id)) |
|
# GSE -> Exprs |
|
def get_expression_data(gse): |
|
""" Given a gse object, collect each sample's expression data into |
|
a dictionary of dataframes, with one key-value pair for each column of expression data. """ |
|
|
|
table0 = list(gse.gsms.iteritems())[0][1].table |
|
expression_types = table0.columns[1:].tolist() |
|
gene_list = table0['ID_REF'] |
|
sample_ids = gse.gsms.keys() |
|
exprs = {} |
|
for expr_type in expression_types: |
|
expr = \ |
|
pd.DataFrame(np.zeros((len(gene_list), |
|
len(sample_ids))), |
|
index = gene_list, |
|
columns = sample_ids) |
|
|
|
for sample_id in sample_ids: |
|
expr[sample_id] = gse.gsms[sample_id].table[expr_type].values |
|
|
|
exprs[expr_type] = expr |
|
|
|
return combine_dataframes_from_dict(exprs) |
|
|
|
# Expr Dataframe -> [Sample ID] -> [Sample ID] -> Dataframe (t-statistic, p-value) |
|
def compare_subsets(expr, samples_a, samples_b): |
|
""" Given an expression dataframe, compare the subsets specified by each list of sample ids. """ |
|
probe_ids = expr.index.values |
|
|
|
data_a = expr[samples_a] |
|
data_b = expr[samples_b] |
|
|
|
effect_size = (data_b.mean(axis = 1) / data_a.mean(axis = 1)).values |
|
|
|
t,p = ttest_ind(data_b, data_a, axis = 1) |
|
return pd.DataFrame( |
|
np.array([effect_size,t,p]).T, |
|
columns = ['effect size','t-statistic','p-value'], |
|
index = probe_ids) |
|
|
|
# Expr Dataframe -> Groupby Object -> String -> String -> Dataframe (t-statistic, p-value) |
|
def compare_groups(expr, groupby, group1, group2): |
|
""" Compare the given pair groups of the provided groupby object. """ |
|
get_ids = lambda group_name: groupby.get_group(group_name)['sample id'] |
|
return compare_subsets( |
|
expr, |
|
get_ids(group1), |
|
get_ids(group2)) |
|
|
|
# DataFrame (T-statistic, P-value) -> Side Effects (figure) |
|
def show_stats(stats): |
|
""" Show distribution of T-statistic and P-value for each probe site. """ |
|
plt.figure(figsize = (16,3)) |
|
|
|
plt.subplot(1,3,1) |
|
bins = np.linspace(-10,10,50) |
|
stats['t-statistic'].plot(kind = 'hist', bins = bins, color = 'grey') |
|
stats[stats['p-value'] < 0.01]['t-statistic'].plot(kind = 'hist', bins = bins, alpha = 0.4, color = 'red') |
|
plt.title('Distribution of T-statistic across all probe sites\n(overlaid in red with values\ncorresponding to P < 0.01)') |
|
plt.xlabel('T-statistic') |
|
plt.ylabel('Count') |
|
plt.xlim(left = -10, right = 10) |
|
|
|
plt.subplot(1,3,2) |
|
stats['p-value'].plot(kind = 'hist', bins=np.logspace(-10, 0, 100), color = 'grey'); |
|
plt.xscale('log') |
|
plt.title('Distribution of P-value across all probe sites') |
|
plt.xlabel('P-value') |
|
plt.ylabel('Count') |
|
ymin,ymax = plt.gca().get_ylim() |
|
plt.vlines(0.01,ymin,ymax, color = 'k', linestyle = '--', linewidth = 2) |
|
plt.vlines(0.05,ymin,ymax, color = 'k', linestyle = '--', linewidth = 2) |
|
plt.text(0.07, 0.9 * ymax, '0.05', color = 'k') |
|
plt.text(0.007, 0.9 * ymax, '0.01', color = 'k', ha = 'right') |
|
|
|
plt.subplot(1,3,3) |
|
stats['p-value'].plot(kind = 'hist', bins=np.logspace(-10, 0, 100), cumulative = True, normed = 1, color = 'grey'); |
|
plt.xscale('log') |
|
plt.title('Cumulative distribution of P-value across all probe sites') |
|
plt.xlabel('P-value') |
|
plt.vlines(0.01,0,1, color = 'k', linestyle = '--', linewidth = 2) |
|
plt.vlines(0.05,0,1, color = 'k', linestyle = '--', linewidth = 2) |
|
plt.text(0.07, 0.9, '0.05', color = 'k') |
|
plt.text(0.007, 0.9, '0.01', color = 'k', ha = 'right') |
|
plt.ylim(top = 1) |
|
plt.ylabel('Fraction of genes'); |
|
|
|
# [Stats DataFrame] -> Float -> Dataframe (T-statistic, direction) |
|
def probes_from_stats(stats_dfs, p_threshold): |
|
""" Given a list of dataframes containing comparison stats, |
|
return dataframe containing probe sites with p-values below threshold. |
|
Two-columns are returned: t-statistic and direction. |
|
T-statistic is drawn from first stats dataframe. """ |
|
where_p_is_below_threshold = \ |
|
reduce(lambda x,y: x & y, |
|
[stats_df['p-value'] < p_threshold for stats_df in stats_dfs]) |
|
significant_probe_sites = stats_dfs[0][where_p_is_below_threshold][['t-statistic','effect size']] |
|
significant_probe_sites['direction'] = \ |
|
significant_probe_sites['t-statistic'].map(lambda t: 'increased' if t > 0 else 'decreased') |
|
return significant_probe_sites |
|
|
|
# {Expr} -> Groupby Object -> String -> String -> DataFrame (t-statistic, direction) |
|
def get_significant_probes(exprs, groupby, group1, group2, p_threshold = 0.01): |
|
""" Given a dictionary of expression dataframes, a groupby object, and a pair of groups to compare, |
|
return a dataframe containing the probes that show a significant change. """ |
|
return \ |
|
probes_from_stats( |
|
map(lambda expr: compare_groups(expr, groupby, group1, group2), |
|
split_on_first_level(exprs)), |
|
p_threshold) |
|
|
|
# DataFrame (t-statistic, direction) -> {n,n_increased,n_decreased} |
|
def summarize_probes(probes): |
|
""" Given a set of significant probes, and the direction of their change, return a summary. """ |
|
return dict( |
|
n = len(probes), |
|
n_increased = len(probes[probes['direction'] == 'increased']), |
|
n_decreased = len(probes[probes['direction'] == 'decreased'])) |
|
|
|
# DataFrame -> String |
|
def add_column_level(df,value): |
|
""" Return new dataframe by adding value as the first level of a compound column name. (for all columns). """ |
|
new = df.copy().T |
|
new['level_0'] = value |
|
new = new.set_index('level_0',append = True).swaplevel(0,1) |
|
return new.T |
|
|
|
# {String: DataFrame} -> DataFrame (with compound column names) |
|
def combine_dataframes_from_dict(df_dict): |
|
""" Given a dictionary of dataframes, join them on index, |
|
and encode their keys as the first level of a compound column name. """ |
|
new_dfs = map(lambda (label, dataframe): add_column_level(dataframe, label), df_dict.iteritems()) |
|
return reduce(lambda df1,df2: pd.merge(df1, df2, left_index = True, right_index = True), new_dfs) |
|
|
|
# DataFrame (with compound column names) -> [DataFrame] |
|
def split_on_first_level(df): |
|
""" Return list of dataframes by splitting on first element of compound column name. """ |
|
return [df[level_0] for level_0 in df.columns.levels[0]] |
|
|
|
# String -> DataFrame |
|
def read_expr_csv(filepath): |
|
""" Given filepath to CSV, return exprs dataframe. |
|
(This function just imports the CSV with a compound header and an index.)""" |
|
return pd.read_csv(filepath,header = [0,1], index_col = 0) |
|
|
|
# [a] -> [a] -> (Int, Int, Int) |
|
def venn(x,y): |
|
""" Given two lists, return a triple containing the following: |
|
- how many unique elements are found only in first list |
|
- how many unique elements are found in both lists |
|
- how many unique elements are found only in second list """ |
|
|
|
x_set = set(x) |
|
y_set = set(y) |
|
return map(len, |
|
[x_set.difference(y_set), |
|
x_set.intersection(y_set), |
|
y_set.difference(x_set)]) |
|
|