dela3499 · January 12, 2016 01:02
diff --git a/cmap_ks_test.py b/cmap_ks_test.py
 # type PrerankedProfile = Series(Index = Integer, Probe_ID = String, Rank = Integer)

 # DataFrame -> List PrerankedProfile
 def cmap2sortedseries(df):
    return [series.order(ascending = True).reset_index() for series in df2series(df)]

 # PrerankedProfile -> List a -> Float
 def gsea_preranked(sorted_series, tag_list):
    """ Apply KS test.
        ex: scores = map(lambda profile: gsea_preranked(profile, up_probes), profiles)
    """
    n_tags = float(len(tag_list))
    relative_ranks = (sorted_series[sorted_series.ix[:,0].isin(tag_list)].index + 1)/ float(len(sorted_series))
    relative_tag_ranks = np.linspace(0, 1, n_tags + 1)
    a = max(relative_tag_ranks[1:] - relative_ranks)
    b = max(relative_ranks - relative_tag_ranks[:-1])
    return if_(a > b, a, -b)
    
 # Example
 cmap = pd.read_csv('/path/to/rankMatrix.txt', sep = '\t', index_col = 0)
 instances = cmap2sortedseries(cmap)
 list_of_probes = ['207199_at', '201058_s_at'] # example probe list to find enrichment for
 enrichment_value = gsea_preranked(instances[0], list_of_probes)
diff --git a/cmap_ks_test_fast.py b/cmap_ks_test_fast.py
 # Faster version (about 160 times faster than above) - vectorized

 # Matrix -> Matrix
 sort_columns = lambda mat: np.apply_along_axis(np.sort, 0, mat)

 cmap_relative_ranks = cmap.values / float(cmap.shape[0])

 # PrerankedProfile -> List a -> Float
 def gsea_preranked2(tag_indices):
    """ Apply KS test.
        ex: scores = map(lambda profile: gsea_preranked(profile, up_probes), profiles)
    """
    n_tags = float(len(tag_indices))
    sorted_relative_ranks = sort_columns(cmap_relative_ranks[tag_indices,:])
    relative_tag_ranks = list2col(np.linspace(0, 1, n_tags + 1))
    a = np.max(relative_tag_ranks[1:] - sorted_relative_ranks, axis = 0)
    b = np.max(sorted_relative_ranks - relative_tag_ranks[:-1], axis = 0)
    return tz.thread_last(
        [a,b],
        np.vstack, 
        (np.apply_along_axis, lambda (ai,bi): if_(ai > bi, ai, -bi), 0))
diff --git a/cmap_ks_test_fast2.py b/cmap_ks_test_fast2.py
 # DataFrame[Row = Probe, Column = Profile]
 def get_cmap():
    """ Return dataframe with cmap data. """
    return  pd.read_csv('/home/bioage/scoring/CMAP test info/rankMatrix.txt', sep = '\t', index_col = 0).dropna(how = 'all', axis = 1)

 # DataFrame[Row = Probe, Column = Profile]
 def get_relative_cmap():
    """ Return dataframe with cmap data as relative ranks. """
    cmap = pd.read_csv('/home/bioage/scoring/CMAP test info/rankMatrix.txt', sep = '\t', index_col = 0).dropna(how = 'all', axis = 1)
    return  cmap / float(cmap.shape[0])
    
 # DataFrame
 def get_cmap_metadata():
    """ Return DataFrame with CMAP metadata for each instance. """
    return pd.read_csv('/home/bioage/scoring/CMAP test info/cmap_metadata.csv', delimiter = '\t', index_col = 0).dropna(how = 'all', axis = 0)

 # String -> Dict
 def read_msigdb_probesets(filename = '/home/bioage/scoring/msigdb_v5.0/probeset.json'):
    """ Read newline-delimited json file with probesets into a
        dictionary, where each record is a two-element list. """
    return tz.thread_last(
        filename,
        read_file,
        split_lines,
        (map, json.loads),
        dict)

 # Matrix -> Matrix
 sort_columns = lambda mat: np.apply_along_axis(np.sort, 0, mat)

 # DataFrame[row = probe, col = drug, val = relative rank] -> List String -> Float
 def gsea_preranked(cmap_relative_ranks, probes):
    """ Apply KS test. Provide cmap containing relative ranks.
        ex: scores = map(lambda profile: gsea_preranked(cmap, up_probe_indices), profiles)
    """
    n_probes = float(len(probes))
    sorted_relative_ranks = sort_columns(cmap_relative_ranks.loc[probes].values)
    relative_tag_ranks = list2col(np.linspace(0, 1, n_probes   + 1))
    a = np.max(relative_tag_ranks[1:] - sorted_relative_ranks, axis = 0)
    b = np.max(sorted_relative_ranks - relative_tag_ranks[:-1], axis = 0)
    return tz.thread_last(
        [a,b],
        np.vstack,
        (np.apply_along_axis, lambda (ai,bi): if_(ai > bi, ai, -bi), 0)) # I can vectorize this better.
        
 # Ex. 
 cmap = get_relative_cmap() # 30s
 sigs = read_msigdb_probesets()
 senescence_signatures = sorted([(name, probes) for name, probes in sigs.items() if 'senesc' in name.lower()], key = lambda (_, b): len(b)) # 7 signatures

 # Step 2: Get enrichment scores for each signature across all CMAP instances
 senescence_drug_scores = [gsea_preranked(cmap, probes) for _, probes in senescence_signatures]
 senescence_results = pd.DataFrame(
    senescence_drug_scores,
    index = map(head, senescence_signatures), 
    columns = cmap.columns)
	# type PrerankedProfile = Series(Index = Integer, Probe_ID = String, Rank = Integer)

	# DataFrame -> List PrerankedProfile
	def cmap2sortedseries(df):
	return [series.order(ascending = True).reset_index() for series in df2series(df)]

	# PrerankedProfile -> List a -> Float
	def gsea_preranked(sorted_series, tag_list):
	""" Apply KS test.
	ex: scores = map(lambda profile: gsea_preranked(profile, up_probes), profiles)
	"""
	n_tags = float(len(tag_list))
	relative_ranks = (sorted_series[sorted_series.ix[:,0].isin(tag_list)].index + 1)/ float(len(sorted_series))
	relative_tag_ranks = np.linspace(0, 1, n_tags + 1)
	a = max(relative_tag_ranks[1:] - relative_ranks)
	b = max(relative_ranks - relative_tag_ranks[:-1])
	return if_(a > b, a, -b)

	# Example
	cmap = pd.read_csv('/path/to/rankMatrix.txt', sep = '\t', index_col = 0)
	instances = cmap2sortedseries(cmap)
	list_of_probes = ['207199_at', '201058_s_at'] # example probe list to find enrichment for
	enrichment_value = gsea_preranked(instances[0], list_of_probes)
	# Faster version (about 160 times faster than above) - vectorized

	# Matrix -> Matrix
	sort_columns = lambda mat: np.apply_along_axis(np.sort, 0, mat)

	cmap_relative_ranks = cmap.values / float(cmap.shape[0])

	# PrerankedProfile -> List a -> Float
	def gsea_preranked2(tag_indices):
	""" Apply KS test.
	ex: scores = map(lambda profile: gsea_preranked(profile, up_probes), profiles)
	"""
	n_tags = float(len(tag_indices))
	sorted_relative_ranks = sort_columns(cmap_relative_ranks[tag_indices,:])
	relative_tag_ranks = list2col(np.linspace(0, 1, n_tags + 1))
	a = np.max(relative_tag_ranks[1:] - sorted_relative_ranks, axis = 0)
	b = np.max(sorted_relative_ranks - relative_tag_ranks[:-1], axis = 0)
	return tz.thread_last(
	[a,b],
	np.vstack,
	(np.apply_along_axis, lambda (ai,bi): if_(ai > bi, ai, -bi), 0))
	# DataFrame[Row = Probe, Column = Profile]
	def get_cmap():
	""" Return dataframe with cmap data. """
	return pd.read_csv('/home/bioage/scoring/CMAP test info/rankMatrix.txt', sep = '\t', index_col = 0).dropna(how = 'all', axis = 1)

	# DataFrame[Row = Probe, Column = Profile]
	def get_relative_cmap():
	""" Return dataframe with cmap data as relative ranks. """
	cmap = pd.read_csv('/home/bioage/scoring/CMAP test info/rankMatrix.txt', sep = '\t', index_col = 0).dropna(how = 'all', axis = 1)
	return cmap / float(cmap.shape[0])

	# DataFrame
	def get_cmap_metadata():
	""" Return DataFrame with CMAP metadata for each instance. """
	return pd.read_csv('/home/bioage/scoring/CMAP test info/cmap_metadata.csv', delimiter = '\t', index_col = 0).dropna(how = 'all', axis = 0)

	# String -> Dict
	def read_msigdb_probesets(filename = '/home/bioage/scoring/msigdb_v5.0/probeset.json'):
	""" Read newline-delimited json file with probesets into a
	dictionary, where each record is a two-element list. """
	return tz.thread_last(
	filename,
	read_file,
	split_lines,
	(map, json.loads),
	dict)

	# Matrix -> Matrix
	sort_columns = lambda mat: np.apply_along_axis(np.sort, 0, mat)

	# DataFrame[row = probe, col = drug, val = relative rank] -> List String -> Float
	def gsea_preranked(cmap_relative_ranks, probes):
	""" Apply KS test. Provide cmap containing relative ranks.
	ex: scores = map(lambda profile: gsea_preranked(cmap, up_probe_indices), profiles)
	"""
	n_probes = float(len(probes))
	sorted_relative_ranks = sort_columns(cmap_relative_ranks.loc[probes].values)
	relative_tag_ranks = list2col(np.linspace(0, 1, n_probes + 1))
	a = np.max(relative_tag_ranks[1:] - sorted_relative_ranks, axis = 0)
	b = np.max(sorted_relative_ranks - relative_tag_ranks[:-1], axis = 0)
	return tz.thread_last(
	[a,b],
	np.vstack,
	(np.apply_along_axis, lambda (ai,bi): if_(ai > bi, ai, -bi), 0)) # I can vectorize this better.

	# Ex.
	cmap = get_relative_cmap() # 30s
	sigs = read_msigdb_probesets()
	senescence_signatures = sorted([(name, probes) for name, probes in sigs.items() if 'senesc' in name.lower()], key = lambda (_, b): len(b)) # 7 signatures

	# Step 2: Get enrichment scores for each signature across all CMAP instances
	senescence_drug_scores = [gsea_preranked(cmap, probes) for _, probes in senescence_signatures]
	senescence_results = pd.DataFrame(
	senescence_drug_scores,
	index = map(head, senescence_signatures),
	columns = cmap.columns)