Last active
March 30, 2020 06:05
-
-
Save potikanond/681c3df241cb82fb23d525e8cb9e3adb to your computer and use it in GitHub Desktop.
Basic gene expression analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from mpl_toolkits.mplot3d import Axes3D | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
%matplotlib inline | |
fig = plt.figure(figsize=(8,6)) | |
ax = fig.add_subplot(111, projection='3d') | |
ax.scatter(gxp['1Hrs'],gxp['2Hrs'],gxp['3Hrs'], c='r', s=40) | |
ax.set_xlabel('1Hrs') | |
ax.set_ylabel('2Hrs') | |
ax.set_zlabel('3Hrs') | |
ax.view_init(45,45) | |
plt.show() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fig = plt.figure(figsize=(8,6)) | |
ax = fig.add_subplot(111, projection='3d') | |
# plot data with label | |
ax.scatter(gxp['1Hrs'],gxp['2Hrs'],gxp['3Hrs'], | |
c=model.labels_, cmap='rainbow', s=40, ec='black') | |
# plot cluster's CG | |
for cg in model.cluster_centers_: | |
ax.scatter(cg[0],cg[1],cg[2], | |
cmap='rainbow', s=40, marker='^', ec='black') | |
ax.set_xlabel('1Hrs') | |
ax.set_ylabel('2Hrs') | |
ax.set_zlabel('3Hrs') | |
ax.view_init(45,45) | |
plt.show() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.spatial.distance import pdist, squareform | |
from scipy.cluster.hierarchy import linkage, dendrogram | |
def bicluster( | |
matrix, | |
gene_cluster_metric='correlation', | |
sample_cluster_metric='euclidean', | |
cluster_method='average', | |
reverse_genes=False, | |
reverse_samples=False): | |
matrix = cluster_genes( | |
matrix, | |
metric=gene_cluster_metric, | |
method=cluster_method, | |
reverse=reverse_genes | |
) | |
matrix = cluster_samples( | |
matrix, | |
metric=sample_cluster_metric, | |
method=cluster_method, | |
reverse=reverse_samples | |
) | |
return matrix | |
def _cluster_ndarray(X, metric, method, reverse): | |
assert isinstance(metric, str) | |
assert isinstance(method, str) | |
assert isinstance(reverse, bool) | |
distxy = squareform(pdist(X, metric=metric)) | |
R = dendrogram(linkage(distxy, method=method), no_plot=True) | |
order_rows = np.int64([int(l) for l in R['ivl']]) | |
if reverse: | |
order_rows = order_rows[::-1] | |
return order_rows | |
def cluster_genes(matrix, metric='correlation', method='average', | |
reverse=False): | |
# note: method = 'average' sometimes causes kernel to crash | |
order_rows = _cluster_ndarray(matrix, metric=metric, method=method, | |
reverse=reverse) | |
matrix = matrix.iloc[order_rows] | |
return matrix | |
def cluster_samples(matrix, metric='euclidean', method='average', | |
reverse=False): | |
# note: method = 'average' sometimes causes kernel to crash | |
# workaround for scipy bug when supplied with a row of NaNs | |
# see: https://github.com/scipy/scipy/issues/5142 | |
valid_rows = (matrix.values != 0).all(axis=1) | |
filtered = matrix.loc[valid_rows] | |
order_cols = _cluster_ndarray(filtered.T, metric=metric, method=method, | |
reverse=reverse) | |
matrix = matrix.iloc[:, order_cols] | |
return matrix |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def center_genes(df,use_median=False): | |
p = df.shape[0] # no. of genes (#rows) | |
n = df.shape[1] # no. of samples (#columns) | |
X = df | |
if use_median: | |
X = X - np.tile(np.median(X,axis=1), (n,1)).T | |
else: | |
X = X - np.tile(np.mean(X,axis=1), (n,1)).T | |
return X |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sse = {} # dictionary | |
for k in range(1,11): | |
kmeans = KMeans(n_clusters=k, max_iter=100) | |
kmeans.fit(gxp_log) | |
sse[k] = kmeans.inertia_ | |
plt.figure() | |
plt.plot(list(sse.keys()), list(sse.values())) | |
plt.xlabel("Number of cluster") | |
plt.ylabel("SSE") | |
plt.show() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def filter_variance(df, top): | |
p = df.shape[0] # no. of genes (#rows) | |
n = df.shape[1] # no. of samples (#columns) | |
if top >= p: | |
print('Variance filter has no effect ' | |
'("top" parameter is >= No. of genes).') | |
return df.copy() | |
var = np.var(df,axis=1, ddof=1) | |
total_var = np.sum(var) # total sum of variance | |
a = np.argsort(var) # sort data by specifying index min-to-max | |
a = a[::-1] # max-to-min | |
sel = np.zeros(p, dtype=np.bool_) | |
sel[a[:top]] = True # set selector of 'top' to True | |
lost_p = p - top | |
lost_var = total_var - np.sum(var[sel]) | |
print('Selected the {} most variable genes ' | |
'(excluded {:.2f}% of genes, representing {:.2f}% ' | |
'of total variance).'.format( | |
top, 100*(lost_p / float(p)), | |
100 * (lost_var /total_var))) | |
df = df.loc[sel] | |
return df |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy.cluster.hierarchy import dendrogram | |
def plot_dendrogram(model, **kwargs): | |
# Create linkage matrix and then plot the dendrogram | |
# create the counts of samples under each node | |
counts = np.zeros(model.children_.shape[0]) | |
n_samples = len(model.labels_) | |
for i, merge in enumerate(model.children_): | |
current_count = 0 | |
for child_idx in merge: | |
if child_idx < n_samples: | |
current_count += 1 # leaf node | |
else: | |
current_count += counts[child_idx - n_samples] | |
counts[i] = current_count | |
linkage_matrix = np.column_stack([model.children_, model.distances_, | |
counts]).astype(float) | |
# Plot the corresponding dendrogram | |
dendrogram(linkage_matrix, **kwargs) | |
# plot the top 'p' levels of the dendrogram | |
plt.figure(figsize=(8,4)) | |
plot_dendrogram(model1, truncate_mode='level', p=3) | |
plt.title('Dendrogram (euclidean,ward)') | |
plt.xlabel('Number of points in node') | |
plt.show() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
data = { | |
'genes':['AAA','BBB','CCC','DDD','EEE','FFF','GGG','HHH','III','JJJ'], | |
'1Hrs' :[10, 10, 4, 9.5, 4.5, 10.5, 5.0, 2.7, 9.7, 10.2], | |
'2Hrs' :[8, 0, 8.5, 0.5, 8.5, 9, 8.5, 8.7, 2, 1], | |
'3Hrs' :[10, 9, 3, 8.5, 2.5, 12, 11, 2, 9, 9.2] | |
} | |
gxp = pd.DataFrame(data) | |
gxp |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment