@dela3499
Last active September 22, 2015 22:24
Tools for gene expression analysis
# Download or retrieve dataset in SOFT format
soft_filepath = "./GSE11882.soft.gz"
gse = GEOparse.get_GEO(filepath = soft_filepath)
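# If the SOFT file isn't already on disk, GEOparse can fetch it by GEO accession instead
# (a hedged sketch; assumes get_GEO's geo/destdir keyword arguments):
# gse = GEOparse.get_GEO(geo = "GSE11882", destdir = ".")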
# Parse metadata for individual samples
# [String] -> Dict (individual_id, brain_region, gender, age)
def parse_characteristics(c):
    parse = lambda i: c[i].split(':')[1].strip()
    return dict(
        individual_id = parse(0).replace(' ',''),
        brain_region = parse(1),
        gender = parse(2),
        age = int(parse(3)),
    )
# Int -> String
def label_age_group(age):
    if 20 <= age < 40:
        return "20-39"
    elif 40 <= age < 60:
        return "40-59"
    elif 60 <= age < 80:
        return "60-79"
    elif 80 <= age <= 99:
        return "80-99"
    else:
        return ""
# Dict -> Dict
def parse_metadata(metadata):
""" Given a sample's metadata dictionary,
parse it to return a new dictionary. """
my_dict = parse_characteristics(metadata['characteristics_ch1'])
my_dict['age group'] = label_age_group(my_dict['age'])
my_dict['binary age group'] = '<60' if my_dict['age'] < 60 else '60+'
my_dict['sample id'] = metadata['geo_accession'][0]
return my_dict
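# Worked example with hypothetical characteristics strings (the exact field text in
# GSE11882 may differ; only the "key: value" layout is assumed):
# parse_metadata({'characteristics_ch1': ['individual: 23', 'brain region: hippocampus',
#                                         'gender: F', 'age: 74'],
#                 'geo_accession': ['GSM300000']})
# => {'individual_id': '23', 'brain_region': 'hippocampus', 'gender': 'F', 'age': 74,
#     'age group': '60-79', 'binary age group': '60+', 'sample id': 'GSM300000'}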
# Transform SOFT format into two dataframes: a metadata index, and expression data
meta = create_metadata_index(parse_metadata, gse)
exprs = get_expression_data(gse)
# Save transformed data for later use
meta.to_csv('/notebooks/tmp/GSE11882_meta.csv')
exprs.to_csv('/notebooks/tmp/GSE11882_exprs.csv') # 20 seconds
# Perform some basic analysis
get_stats = \
    lambda exprtype: \
        compare_groups(
            exprs[exprtype],
            meta.groupby('binary age group'),
            '<60',
            '60+')
gcrma_stats = get_stats('VALUE')
plier_stats = get_stats('PLIER')
show_stats(gcrma_stats)
show_stats(plier_stats)
# {Expr DataFrame} -> GroupBy object -> DataFrame (group, n, n_increased, n_decreased)
def get_young_old_probe_changes_across_groups(exprs, groupby):
""" Given a groupby object, find the summary of gene changes
between old and young for each group it contains."""
summaries = []
for name, group in groupby:
summary = \
summarize_probes(
get_significant_probes(
exprs,
group.groupby('binary age group'),
'<60',
'60+'))
summary['group'] = name
summaries.append(summary)
return pd.DataFrame(summaries)
brain_region_gene_changes = \
    get_young_old_probe_changes_across_groups(
        exprs,
        meta.groupby('brain_region'))

brain_region_gene_changes \
    [['group','n_decreased','n_increased']] \
    .sort('n_decreased', ascending = False) \
    .plot(kind = 'bar',
          x = 'group',
          stacked = True)
plt.ylabel('# of gene changes')
plt.xlabel('Brain region')
plt.gca().set_xticklabels(['SFG', 'PCG', 'HC', 'EC'], rotation = 0)
plt.title('# of gene changes across the brain');

create_metadata_index

(Dict (gsm.metadata) -> Dict (fields)) -> GSE -> Dataframe (fields)

Parse and combine the metadata from each sample in the provided gse object.


Exprs = DataFrame (rows = probe sites, columns = (expression type, sample id))


get_expression_data

GSE -> Exprs

Given a gse object, collect each sample's expression data into a dictionary of dataframes, with one key-value pair for each column of expression data, then combine them into a single dataframe with compound column names (expression type, sample id).


compare_subsets

Expr Dataframe -> [Sample ID] -> [Sample ID] -> Dataframe (t-statistic, p-value)

Given an expression dataframe, compare the subsets specified by each list of sample ids.


compare_groups

Expr Dataframe -> Groupby Object -> String -> String -> Dataframe (t-statistic, p-value)

Compare the given pair of groups from the provided groupby object.


show_stats

DataFrame (T-statistic, P-value) -> Side Effects (figure)

Show distribution of T-statistic and P-value for each probe site.


probes_from_stats

[Stats DataFrame] -> Float -> Dataframe (T-statistic, direction)

Given a list of dataframes containing comparison stats, return dataframe containing probe sites with p-values below threshold.

Two columns are returned: t-statistic and direction. The t-statistic is drawn from the first stats dataframe.


get_significant_probes

{Expr} -> Groupby Object -> String -> String -> optional Float -> DataFrame (t-statistic, direction)

Given a dictionary of expression dataframes, a groupby object, and a pair of groups to compare, return a dataframe containing the probes that show a significant change.

def get_significant_probes(exprs, groupby, group1, group2, p_threshold = 0.01):


summarize_probes

DataFrame (t-statistic, direction) -> {n,n_increased,n_decreased}

Given a set of significant probes, and the direction of their change, return a summary.

import pandas as pd
import numpy as np
import GEOparse
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
import toolz as tz
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
from scipy.stats import ttest_ind
# (Dict (gsm.metadata) -> Dict (fields)) -> GSE -> Dataframe (fields)
def create_metadata_index(parse_func, gse):
""" Parse and combine the metadata from each sample in the provided gse object."""
return pd.DataFrame([parse_func(gsm.metadata) for _ , gsm in gse.gsms.iteritems()])
# Exprs = DataFrame (rows = probe sites, columns = (expression type, sample id))
# GSE -> Exprs
def get_expression_data(gse):
""" Given a gse object, collect each sample's expression data into
a dictionary of dataframes, with one key-value pair for each column of expression data. """
table0 = list(gse.gsms.iteritems())[0][1].table
expression_types = table0.columns[1:].tolist()
gene_list = table0['ID_REF']
sample_ids = gse.gsms.keys()
exprs = {}
for expr_type in expression_types:
expr = \
pd.DataFrame(np.zeros((len(gene_list),
len(sample_ids))),
index = gene_list,
columns = sample_ids)
for sample_id in sample_ids:
expr[sample_id] = gse.gsms[sample_id].table[expr_type].values
exprs[expr_type] = expr
return combine_dataframes_from_dict(exprs)
# Expr Dataframe -> [Sample ID] -> [Sample ID] -> Dataframe (t-statistic, p-value)
def compare_subsets(expr, samples_a, samples_b):
""" Given an expression dataframe, compare the subsets specified by each list of sample ids. """
probe_ids = expr.index.values
data_a = expr[samples_a]
data_b = expr[samples_b]
effect_size = (data_b.mean(axis = 1) / data_a.mean(axis = 1)).values
t,p = ttest_ind(data_b, data_a, axis = 1)
return pd.DataFrame(
np.array([effect_size,t,p]).T,
columns = ['effect size','t-statistic','p-value'],
index = probe_ids)
# Expr Dataframe -> Groupby Object -> String -> String -> Dataframe (t-statistic, p-value)
def compare_groups(expr, groupby, group1, group2):
""" Compare the given pair groups of the provided groupby object. """
get_ids = lambda group_name: groupby.get_group(group_name)['sample id']
return compare_subsets(
expr,
get_ids(group1),
get_ids(group2))
# DataFrame (T-statistic, P-value) -> Side Effects (figure)
def show_stats(stats):
""" Show distribution of T-statistic and P-value for each probe site. """
plt.figure(figsize = (16,3))
plt.subplot(1,3,1)
bins = np.linspace(-10,10,50)
stats['t-statistic'].plot(kind = 'hist', bins = bins, color = 'grey')
stats[stats['p-value'] < 0.01]['t-statistic'].plot(kind = 'hist', bins = bins, alpha = 0.4, color = 'red')
plt.title('Distribution of T-statistic across all probe sites\n(overlaid in red with values\ncorresponding to P < 0.01)')
plt.xlabel('T-statistic')
plt.ylabel('Count')
plt.xlim(left = -10, right = 10)
plt.subplot(1,3,2)
stats['p-value'].plot(kind = 'hist', bins=np.logspace(-10, 0, 100), color = 'grey');
plt.xscale('log')
plt.title('Distribution of P-value across all probe sites')
plt.xlabel('P-value')
plt.ylabel('Count')
ymin,ymax = plt.gca().get_ylim()
plt.vlines(0.01,ymin,ymax, color = 'k', linestyle = '--', linewidth = 2)
plt.vlines(0.05,ymin,ymax, color = 'k', linestyle = '--', linewidth = 2)
plt.text(0.07, 0.9 * ymax, '0.05', color = 'k')
plt.text(0.007, 0.9 * ymax, '0.01', color = 'k', ha = 'right')
plt.subplot(1,3,3)
stats['p-value'].plot(kind = 'hist', bins=np.logspace(-10, 0, 100), cumulative = True, normed = 1, color = 'grey');
plt.xscale('log')
plt.title('Cumulative distribution of P-value across all probe sites')
plt.xlabel('P-value')
plt.vlines(0.01,0,1, color = 'k', linestyle = '--', linewidth = 2)
plt.vlines(0.05,0,1, color = 'k', linestyle = '--', linewidth = 2)
plt.text(0.07, 0.9, '0.05', color = 'k')
plt.text(0.007, 0.9, '0.01', color = 'k', ha = 'right')
plt.ylim(top = 1)
plt.ylabel('Fraction of genes');
# [Stats DataFrame] -> Float -> Dataframe (T-statistic, direction)
def probes_from_stats(stats_dfs, p_threshold):
""" Given a list of dataframes containing comparison stats,
return dataframe containing probe sites with p-values below threshold.
Two-columns are returned: t-statistic and direction.
T-statistic is drawn from first stats dataframe. """
where_p_is_below_threshold = \
reduce(lambda x,y: x & y,
[stats_df['p-value'] < p_threshold for stats_df in stats_dfs])
significant_probe_sites = stats_dfs[0][where_p_is_below_threshold][['t-statistic','effect size']]
significant_probe_sites['direction'] = \
significant_probe_sites['t-statistic'].map(lambda t: 'increased' if t > 0 else 'decreased')
return significant_probe_sites
# {Expr} -> Groupby Object -> String -> String -> DataFrame (t-statistic, direction)
def get_significant_probes(exprs, groupby, group1, group2, p_threshold = 0.01):
""" Given a dictionary of expression dataframes, a groupby object, and a pair of groups to compare,
return a dataframe containing the probes that show a significant change. """
return \
probes_from_stats(
map(lambda expr: compare_groups(expr, groupby, group1, group2),
split_on_first_level(exprs)),
p_threshold)
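# Note: a probe is kept only if its p-value clears the threshold in every expression
# dataframe (here both the gcRMA 'VALUE' and 'PLIER' summaries), because
# probes_from_stats AND-combines the per-dataframe masks.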
# DataFrame (t-statistic, direction) -> {n,n_increased,n_decreased}
def summarize_probes(probes):
""" Given a set of significant probes, and the direction of their change, return a summary. """
return dict(
n = len(probes),
n_increased = len(probes[probes['direction'] == 'increased']),
n_decreased = len(probes[probes['direction'] == 'decreased']))
# DataFrame -> String -> DataFrame
def add_column_level(df, value):
    """ Return a new dataframe by adding value as the first level of a compound column name (for all columns). """
    new = df.copy().T
    new['level_0'] = value
    new = new.set_index('level_0', append = True).swaplevel(0, 1)
    return new.T
# {String: DataFrame} -> DataFrame (with compound column names)
def combine_dataframes_from_dict(df_dict):
""" Given a dictionary of dataframes, join them on index,
and encode their keys as the first level of a compound column name. """
new_dfs = map(lambda (label, dataframe): add_column_level(dataframe, label), df_dict.iteritems())
return reduce(lambda df1,df2: pd.merge(df1, df2, left_index = True, right_index = True), new_dfs)
# DataFrame (with compound column names) -> [DataFrame]
def split_on_first_level(df):
""" Return list of dataframes by splitting on first element of compound column name. """
return [df[level_0] for level_0 in df.columns.levels[0]]
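# Round-trip sketch with toy data (hypothetical values and sample id) showing how the
# compound-column helpers fit together:
# a = pd.DataFrame({'GSM1': [1.0, 2.0]}, index = ['probe1', 'probe2'])
# b = pd.DataFrame({'GSM1': [3.0, 4.0]}, index = ['probe1', 'probe2'])
# combined = combine_dataframes_from_dict({'VALUE': a, 'PLIER': b})
# combined.columns               => MultiIndex with ('VALUE', 'GSM1') and ('PLIER', 'GSM1')
# split_on_first_level(combined) => one single-level dataframe per expression type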
# String -> DataFrame
def read_expr_csv(filepath):
""" Given filepath to CSV, return exprs dataframe.
(This function just imports the CSV with a compound header and an index.)"""
return pd.read_csv(filepath,header = [0,1], index_col = 0)
# [a] -> [a] -> (Int, Int, Int)
def venn(x,y):
""" Given two lists, return a triple containing the following:
- how many unique elements are found only in first list
- how many unique elements are found in both lists
- how many unique elements are found only in second list """
x_set = set(x)
y_set = set(y)
return map(len,
[x_set.difference(y_set),
x_set.intersection(y_set),
y_set.difference(x_set)])
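# Example: venn([1, 2, 3], [2, 3, 4]) => [1, 2, 1]
# (one unique element only in the first list, two shared, one only in the second)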