dela3499 · September 22, 2015 22:24
diff --git a/example.py b/example.py

 # Download or retrieve dataset in SOFT format
 soft_filepath = "./GSE11882.soft.gz"
 gse = GEOparse.get_GEO(filepath = soft_filepath)


 # Parse metadata for individual samples

 # [String] -> Dict (individual_id, brain_region, gender, age)
 def parse_characteristics(c):
    parse = lambda i: c[i].split(':')[1].strip()
    return dict(
        individual_id = parse(0).replace(' ',''),
        brain_region = parse(1),
        gender = parse(2),
        age = int(parse(3)), 
        )

 # Int -> String
 def label_age_group(age):
    if 20 <= age < 39:
        return "20-39"
    elif 40 <= age < 59:
        return "40-59"
    elif 60 <= age < 79:
        return "60-79"
    elif 80 <= age <= 99:
        return "80-99"
    else: 
        return ""

 # Dict -> Dict
 def parse_metadata(metadata):
    """ Given a sample's metadata dictionary, 
        parse it to return a new dictionary. """
    my_dict = parse_characteristics(metadata['characteristics_ch1'])
    my_dict['age group'] = label_age_group(my_dict['age'])
    my_dict['binary age group'] = '<60' if my_dict['age'] < 60 else '60+'
    my_dict['sample id'] = metadata['geo_accession'][0]
    return my_dict
 
    
 # Transform SOFT format into two dataframes: a metadata index, and expression data
 meta = create_metadata_index(parse_metadata, gse)
 exprs = get_expression_data(gse)

 # Save transformed data for later use
 meta.to_csv('/notebooks/tmp/GSE11882_meta.csv')
 exprs.to_csv('/notebooks/tmp/GSE11882_exprs.csv') # 20 seconds


 # Perform some basic analysis

 get_stats = \
    lambda exprtype: \
        compare_groups(
            exprs[exprtype], 
            meta.groupby('binary age group'),
            '<60',
            '60+')

 gcrma_stats = get_stats('VALUE')
 plier_stats = get_stats('PLIER')

 show_stats(gcrma_stats)
 show_stats(plier_stats)

 # {Expr DataFrame} -> GroupBy object -> DataFrame (group, n, n_increased, n_decreased)
 def get_young_old_probe_changes_across_groups(exprs, groupby):
    """ Given a groupby object, find the summary of gene changes 
        between old and young for each group it contains."""
    summaries = []
    for name, group in groupby: 
        summary = \
            summarize_probes(
                get_significant_probes(
                    exprs, 
                    group.groupby('binary age group'), 
                    '<60', 
                    '60+'))
        summary['group'] = name
        summaries.append(summary)
    return pd.DataFrame(summaries)
    
 brain_region_gene_changes = \
    get_young_old_probe_changes_across_groups(
        exprs, 
        meta.groupby('brain_region'))    

 brain_region_gene_changes \
    [['group','n_decreased','n_increased']] \
    .sort('n_decreased', ascending = False) \
    .plot(kind = 'bar',
          x = 'group', 
          stacked = True)

 plt.ylabel('# of gene changes')
 plt.xlabel('Brain region')
 plt.gca().set_xticklabels(['SFG', 'PCG', 'HC', 'EC'], rotation = 0)
 plt.title('# of gene changes across the brain');
diff --git a/gene_expression.md b/gene_expression.md
diff --git a/gene_expression.py b/gene_expression.py
 import pandas as pd
 import numpy as np
 import GEOparse
 import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
 %matplotlib inline
 import toolz as tz
 from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
 from scipy.stats import ttest_ind

 # (Dict (gsm.metadata) -> Dict (fields)) -> GSE -> Dataframe (fields)
 def create_metadata_index(parse_func, gse):
    """ Parse and combine the metadata from each sample in the provided gse object."""
    return pd.DataFrame([parse_func(gsm.metadata) for _ , gsm in gse.gsms.iteritems()])

 # Exprs = DataFrame (rows = probe sites, columns = (expression type, sample id))
 # GSE -> Exprs
 def get_expression_data(gse):
    """ Given a gse object, collect each sample's expression data into 
        a dictionary of dataframes, with one key-value pair for each column of expression data. """
    
    table0 = list(gse.gsms.iteritems())[0][1].table
    expression_types = table0.columns[1:].tolist()
    gene_list = table0['ID_REF']
    sample_ids = gse.gsms.keys()
    exprs = {}
    for expr_type in expression_types:
        expr = \
            pd.DataFrame(np.zeros((len(gene_list),
                                   len(sample_ids))),
                         index = gene_list,
                         columns = sample_ids)
        
        for sample_id in sample_ids:
            expr[sample_id] = gse.gsms[sample_id].table[expr_type].values
        
        exprs[expr_type] = expr
        
    return combine_dataframes_from_dict(exprs)

 # Expr Dataframe -> [Sample ID] -> [Sample ID] -> Dataframe (t-statistic, p-value)
 def compare_subsets(expr, samples_a, samples_b):
    """ Given an expression dataframe, compare the subsets specified by each list of sample ids. """
    probe_ids = expr.index.values
    
    data_a = expr[samples_a]
    data_b = expr[samples_b]
    
    effect_size = (data_b.mean(axis = 1) / data_a.mean(axis = 1)).values
    
    t,p = ttest_ind(data_b, data_a, axis = 1)
    return pd.DataFrame(
                np.array([effect_size,t,p]).T,
                columns = ['effect size','t-statistic','p-value'],
                index = probe_ids)

 # Expr Dataframe -> Groupby Object -> String -> String -> Dataframe (t-statistic, p-value)
 def compare_groups(expr, groupby, group1, group2):
    """ Compare the given pair groups of the provided groupby object. """
    get_ids = lambda group_name: groupby.get_group(group_name)['sample id']
    return compare_subsets(
                expr, 
                get_ids(group1), 
                get_ids(group2))

 # DataFrame (T-statistic, P-value) -> Side Effects (figure)
 def show_stats(stats):
    """ Show distribution of T-statistic and P-value for each probe site. """
    plt.figure(figsize = (16,3))
    
    plt.subplot(1,3,1)
    bins = np.linspace(-10,10,50)
    stats['t-statistic'].plot(kind = 'hist', bins = bins, color = 'grey')
    stats[stats['p-value'] < 0.01]['t-statistic'].plot(kind = 'hist', bins = bins, alpha = 0.4, color = 'red')
    plt.title('Distribution of T-statistic across all probe sites\n(overlaid in red with values\ncorresponding to P < 0.01)')
    plt.xlabel('T-statistic')
    plt.ylabel('Count')
    plt.xlim(left = -10, right = 10)
    
    plt.subplot(1,3,2)
    stats['p-value'].plot(kind = 'hist', bins=np.logspace(-10, 0, 100), color = 'grey');
    plt.xscale('log')
    plt.title('Distribution of P-value across all probe sites')
    plt.xlabel('P-value')
    plt.ylabel('Count')
    ymin,ymax = plt.gca().get_ylim()
    plt.vlines(0.01,ymin,ymax, color = 'k', linestyle = '--', linewidth = 2)
    plt.vlines(0.05,ymin,ymax, color = 'k', linestyle = '--', linewidth = 2)
    plt.text(0.07, 0.9 * ymax, '0.05', color = 'k')
    plt.text(0.007, 0.9 * ymax, '0.01', color = 'k', ha = 'right')

    plt.subplot(1,3,3)
    stats['p-value'].plot(kind = 'hist', bins=np.logspace(-10, 0, 100), cumulative = True, normed = 1, color = 'grey');
    plt.xscale('log')
    plt.title('Cumulative distribution of P-value across all probe sites')
    plt.xlabel('P-value')
    plt.vlines(0.01,0,1, color = 'k', linestyle = '--', linewidth = 2)
    plt.vlines(0.05,0,1, color = 'k', linestyle = '--', linewidth = 2)
    plt.text(0.07, 0.9, '0.05', color = 'k')
    plt.text(0.007, 0.9, '0.01', color = 'k', ha = 'right')
    plt.ylim(top = 1)
    plt.ylabel('Fraction of genes');

 # [Stats DataFrame] -> Float -> Dataframe (T-statistic, direction)
 def probes_from_stats(stats_dfs, p_threshold):
    """ Given a list of dataframes containing comparison stats, 
        return dataframe containing probe sites with p-values below threshold.
        Two-columns are returned: t-statistic and direction.
        T-statistic is drawn from first stats dataframe. """
    where_p_is_below_threshold = \
        reduce(lambda x,y: x & y, 
               [stats_df['p-value'] < p_threshold for stats_df in stats_dfs])
    significant_probe_sites = stats_dfs[0][where_p_is_below_threshold][['t-statistic','effect size']]
    significant_probe_sites['direction'] = \
        significant_probe_sites['t-statistic'].map(lambda t: 'increased' if t > 0 else 'decreased')
    return significant_probe_sites

 # {Expr} -> Groupby Object -> String -> String -> DataFrame (t-statistic, direction)
 def get_significant_probes(exprs, groupby, group1, group2, p_threshold = 0.01):
    """ Given a dictionary of expression dataframes, a groupby object, and a pair of groups to compare, 
        return a dataframe containing the probes that show a significant change. """
    return \
        probes_from_stats(
            map(lambda expr: compare_groups(expr, groupby, group1, group2), 
                split_on_first_level(exprs)), 
            p_threshold)

 # DataFrame (t-statistic, direction) -> {n,n_increased,n_decreased}
 def summarize_probes(probes):
    """ Given a set of significant probes, and the direction of their change, return a summary. """
    return dict(
        n = len(probes),
        n_increased = len(probes[probes['direction'] == 'increased']),
        n_decreased = len(probes[probes['direction'] == 'decreased']))

 # DataFrame -> String
 def add_column_level(df,value):
    """ Return new dataframe by adding value as the first level of a compound column name. (for all columns). """
    new = df.copy().T
    new['level_0'] = value
    new = new.set_index('level_0',append = True).swaplevel(0,1)
    return new.T

 # {String: DataFrame} -> DataFrame (with compound column names)
 def combine_dataframes_from_dict(df_dict):
    """ Given a dictionary of dataframes, join them on index, 
        and encode their keys as the first level of a compound column name. """
    new_dfs = map(lambda (label, dataframe): add_column_level(dataframe, label), df_dict.iteritems())
    return reduce(lambda df1,df2:  pd.merge(df1, df2, left_index = True, right_index = True), new_dfs)

 # DataFrame (with compound column names) -> [DataFrame]
 def split_on_first_level(df):
    """ Return list of dataframes by splitting on first element of compound column name. """
    return [df[level_0] for level_0 in df.columns.levels[0]]

 # String -> DataFrame
 def read_expr_csv(filepath):
    """ Given filepath to CSV, return exprs dataframe. 
        (This function just imports the CSV with a compound header and an index.)"""
    return pd.read_csv(filepath,header = [0,1], index_col = 0)

 # [a] -> [a] -> (Int, Int, Int)
 def venn(x,y):
    """ Given two lists, return a triple containing the following: 
         - how many unique elements are found only in first list
         - how many unique elements are found in both lists
         - how many unique elements are found only in second list """
    
    x_set = set(x)
    y_set = set(y)
    return map(len,
               [x_set.difference(y_set),
                x_set.intersection(y_set),
                y_set.difference(x_set)])

	# Download or retrieve dataset in SOFT format
	soft_filepath = "./GSE11882.soft.gz"
	gse = GEOparse.get_GEO(filepath = soft_filepath)


	# Parse metadata for individual samples

	# [String] -> Dict (individual_id, brain_region, gender, age)
	def parse_characteristics(c):
	parse = lambda i: c[i].split(':')[1].strip()
	return dict(
	individual_id = parse(0).replace(' ',''),
	brain_region = parse(1),
	gender = parse(2),
	age = int(parse(3)),
	)

	# Int -> String
	def label_age_group(age):
	if 20 <= age < 39:
	return "20-39"
	elif 40 <= age < 59:
	return "40-59"
	elif 60 <= age < 79:
	return "60-79"
	elif 80 <= age <= 99:
	return "80-99"
	else:
	return ""

	# Dict -> Dict
	def parse_metadata(metadata):
	""" Given a sample's metadata dictionary,
	parse it to return a new dictionary. """
	my_dict = parse_characteristics(metadata['characteristics_ch1'])
	my_dict['age group'] = label_age_group(my_dict['age'])
	my_dict['binary age group'] = '<60' if my_dict['age'] < 60 else '60+'
	my_dict['sample id'] = metadata['geo_accession'][0]
	return my_dict


	# Transform SOFT format into two dataframes: a metadata index, and expression data
	meta = create_metadata_index(parse_metadata, gse)
	exprs = get_expression_data(gse)

	# Save transformed data for later use
	meta.to_csv('/notebooks/tmp/GSE11882_meta.csv')
	exprs.to_csv('/notebooks/tmp/GSE11882_exprs.csv') # 20 seconds


	# Perform some basic analysis

	get_stats = \
	lambda exprtype: \
	compare_groups(
	exprs[exprtype],
	meta.groupby('binary age group'),
	'<60',
	'60+')

	gcrma_stats = get_stats('VALUE')
	plier_stats = get_stats('PLIER')

	show_stats(gcrma_stats)
	show_stats(plier_stats)

	# {Expr DataFrame} -> GroupBy object -> DataFrame (group, n, n_increased, n_decreased)
	def get_young_old_probe_changes_across_groups(exprs, groupby):
	""" Given a groupby object, find the summary of gene changes
	between old and young for each group it contains."""
	summaries = []
	for name, group in groupby:
	summary = \
	summarize_probes(
	get_significant_probes(
	exprs,
	group.groupby('binary age group'),
	'<60',
	'60+'))
	summary['group'] = name
	summaries.append(summary)
	return pd.DataFrame(summaries)

	brain_region_gene_changes = \
	get_young_old_probe_changes_across_groups(
	exprs,
	meta.groupby('brain_region'))

	brain_region_gene_changes \
	[['group','n_decreased','n_increased']] \
	.sort('n_decreased', ascending = False) \
	.plot(kind = 'bar',
	x = 'group',
	stacked = True)

	plt.ylabel('# of gene changes')
	plt.xlabel('Brain region')
	plt.gca().set_xticklabels(['SFG', 'PCG', 'HC', 'EC'], rotation = 0)
	plt.title('# of gene changes across the brain');
	import pandas as pd
	import numpy as np
	import GEOparse
	import matplotlib.pyplot as plt
	import matplotlib.gridspec as gridspec
	%matplotlib inline
	import toolz as tz
	from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
	from scipy.stats import ttest_ind

	# (Dict (gsm.metadata) -> Dict (fields)) -> GSE -> Dataframe (fields)
	def create_metadata_index(parse_func, gse):
	""" Parse and combine the metadata from each sample in the provided gse object."""
	return pd.DataFrame([parse_func(gsm.metadata) for _ , gsm in gse.gsms.iteritems()])

	# Exprs = DataFrame (rows = probe sites, columns = (expression type, sample id))
	# GSE -> Exprs
	def get_expression_data(gse):
	""" Given a gse object, collect each sample's expression data into
	a dictionary of dataframes, with one key-value pair for each column of expression data. """

	table0 = list(gse.gsms.iteritems())[0][1].table
	expression_types = table0.columns[1:].tolist()
	gene_list = table0['ID_REF']
	sample_ids = gse.gsms.keys()
	exprs = {}
	for expr_type in expression_types:
	expr = \
	pd.DataFrame(np.zeros((len(gene_list),
	len(sample_ids))),
	index = gene_list,
	columns = sample_ids)

	for sample_id in sample_ids:
	expr[sample_id] = gse.gsms[sample_id].table[expr_type].values

	exprs[expr_type] = expr

	return combine_dataframes_from_dict(exprs)

	# Expr Dataframe -> [Sample ID] -> [Sample ID] -> Dataframe (t-statistic, p-value)
	def compare_subsets(expr, samples_a, samples_b):
	""" Given an expression dataframe, compare the subsets specified by each list of sample ids. """
	probe_ids = expr.index.values

	data_a = expr[samples_a]
	data_b = expr[samples_b]

	effect_size = (data_b.mean(axis = 1) / data_a.mean(axis = 1)).values

	t,p = ttest_ind(data_b, data_a, axis = 1)
	return pd.DataFrame(
	np.array([effect_size,t,p]).T,
	columns = ['effect size','t-statistic','p-value'],
	index = probe_ids)

	# Expr Dataframe -> Groupby Object -> String -> String -> Dataframe (t-statistic, p-value)
	def compare_groups(expr, groupby, group1, group2):
	""" Compare the given pair groups of the provided groupby object. """
	get_ids = lambda group_name: groupby.get_group(group_name)['sample id']
	return compare_subsets(
	expr,
	get_ids(group1),
	get_ids(group2))

	# DataFrame (T-statistic, P-value) -> Side Effects (figure)
	def show_stats(stats):
	""" Show distribution of T-statistic and P-value for each probe site. """
	plt.figure(figsize = (16,3))

	plt.subplot(1,3,1)
	bins = np.linspace(-10,10,50)
	stats['t-statistic'].plot(kind = 'hist', bins = bins, color = 'grey')
	stats[stats['p-value'] < 0.01]['t-statistic'].plot(kind = 'hist', bins = bins, alpha = 0.4, color = 'red')
	plt.title('Distribution of T-statistic across all probe sites\n(overlaid in red with values\ncorresponding to P < 0.01)')
	plt.xlabel('T-statistic')
	plt.ylabel('Count')
	plt.xlim(left = -10, right = 10)

	plt.subplot(1,3,2)
	stats['p-value'].plot(kind = 'hist', bins=np.logspace(-10, 0, 100), color = 'grey');
	plt.xscale('log')
	plt.title('Distribution of P-value across all probe sites')
	plt.xlabel('P-value')
	plt.ylabel('Count')
	ymin,ymax = plt.gca().get_ylim()
	plt.vlines(0.01,ymin,ymax, color = 'k', linestyle = '--', linewidth = 2)
	plt.vlines(0.05,ymin,ymax, color = 'k', linestyle = '--', linewidth = 2)
	plt.text(0.07, 0.9 * ymax, '0.05', color = 'k')
	plt.text(0.007, 0.9 * ymax, '0.01', color = 'k', ha = 'right')

	plt.subplot(1,3,3)
	stats['p-value'].plot(kind = 'hist', bins=np.logspace(-10, 0, 100), cumulative = True, normed = 1, color = 'grey');
	plt.xscale('log')
	plt.title('Cumulative distribution of P-value across all probe sites')
	plt.xlabel('P-value')
	plt.vlines(0.01,0,1, color = 'k', linestyle = '--', linewidth = 2)
	plt.vlines(0.05,0,1, color = 'k', linestyle = '--', linewidth = 2)
	plt.text(0.07, 0.9, '0.05', color = 'k')
	plt.text(0.007, 0.9, '0.01', color = 'k', ha = 'right')
	plt.ylim(top = 1)
	plt.ylabel('Fraction of genes');

	# [Stats DataFrame] -> Float -> Dataframe (T-statistic, direction)
	def probes_from_stats(stats_dfs, p_threshold):
	""" Given a list of dataframes containing comparison stats,
	return dataframe containing probe sites with p-values below threshold.
	Two-columns are returned: t-statistic and direction.
	T-statistic is drawn from first stats dataframe. """
	where_p_is_below_threshold = \
	reduce(lambda x,y: x & y,
	[stats_df['p-value'] < p_threshold for stats_df in stats_dfs])
	significant_probe_sites = stats_dfs[0][where_p_is_below_threshold][['t-statistic','effect size']]
	significant_probe_sites['direction'] = \
	significant_probe_sites['t-statistic'].map(lambda t: 'increased' if t > 0 else 'decreased')
	return significant_probe_sites

	# {Expr} -> Groupby Object -> String -> String -> DataFrame (t-statistic, direction)
	def get_significant_probes(exprs, groupby, group1, group2, p_threshold = 0.01):
	""" Given a dictionary of expression dataframes, a groupby object, and a pair of groups to compare,
	return a dataframe containing the probes that show a significant change. """
	return \
	probes_from_stats(
	map(lambda expr: compare_groups(expr, groupby, group1, group2),
	split_on_first_level(exprs)),
	p_threshold)

	# DataFrame (t-statistic, direction) -> {n,n_increased,n_decreased}
	def summarize_probes(probes):
	""" Given a set of significant probes, and the direction of their change, return a summary. """
	return dict(
	n = len(probes),
	n_increased = len(probes[probes['direction'] == 'increased']),
	n_decreased = len(probes[probes['direction'] == 'decreased']))

	# DataFrame -> String
	def add_column_level(df,value):
	""" Return new dataframe by adding value as the first level of a compound column name. (for all columns). """
	new = df.copy().T
	new['level_0'] = value
	new = new.set_index('level_0',append = True).swaplevel(0,1)
	return new.T

	# {String: DataFrame} -> DataFrame (with compound column names)
	def combine_dataframes_from_dict(df_dict):
	""" Given a dictionary of dataframes, join them on index,
	and encode their keys as the first level of a compound column name. """
	new_dfs = map(lambda (label, dataframe): add_column_level(dataframe, label), df_dict.iteritems())
	return reduce(lambda df1,df2: pd.merge(df1, df2, left_index = True, right_index = True), new_dfs)

	# DataFrame (with compound column names) -> [DataFrame]
	def split_on_first_level(df):
	""" Return list of dataframes by splitting on first element of compound column name. """
	return [df[level_0] for level_0 in df.columns.levels[0]]

	# String -> DataFrame
	def read_expr_csv(filepath):
	""" Given filepath to CSV, return exprs dataframe.
	(This function just imports the CSV with a compound header and an index.)"""
	return pd.read_csv(filepath,header = [0,1], index_col = 0)

	# [a] -> [a] -> (Int, Int, Int)
	def venn(x,y):
	""" Given two lists, return a triple containing the following:
	- how many unique elements are found only in first list
	- how many unique elements are found in both lists
	- how many unique elements are found only in second list """

	x_set = set(x)
	y_set = set(y)
	return map(len,
	[x_set.difference(y_set),
	x_set.intersection(y_set),
	y_set.difference(x_set)])