Skip to content

Instantly share code, notes, and snippets.

@potikanond
Last active March 30, 2020 06:05
Show Gist options
  • Save potikanond/681c3df241cb82fb23d525e8cb9e3adb to your computer and use it in GitHub Desktop.
Save potikanond/681c3df241cb82fb23d525e8cb9e3adb to your computer and use it in GitHub Desktop.
Basic gene expression analysis
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(gxp['1Hrs'],gxp['2Hrs'],gxp['3Hrs'], c='r', s=40)
ax.set_xlabel('1Hrs')
ax.set_ylabel('2Hrs')
ax.set_zlabel('3Hrs')
ax.view_init(45,45)
plt.show()
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
# plot data with label
ax.scatter(gxp['1Hrs'],gxp['2Hrs'],gxp['3Hrs'],
c=model.labels_, cmap='rainbow', s=40, ec='black')
# plot cluster's CG
for cg in model.cluster_centers_:
ax.scatter(cg[0],cg[1],cg[2],
cmap='rainbow', s=40, marker='^', ec='black')
ax.set_xlabel('1Hrs')
ax.set_ylabel('2Hrs')
ax.set_zlabel('3Hrs')
ax.view_init(45,45)
plt.show()
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
def bicluster(
matrix,
gene_cluster_metric='correlation',
sample_cluster_metric='euclidean',
cluster_method='average',
reverse_genes=False,
reverse_samples=False):
matrix = cluster_genes(
matrix,
metric=gene_cluster_metric,
method=cluster_method,
reverse=reverse_genes
)
matrix = cluster_samples(
matrix,
metric=sample_cluster_metric,
method=cluster_method,
reverse=reverse_samples
)
return matrix
def _cluster_ndarray(X, metric, method, reverse):
assert isinstance(metric, str)
assert isinstance(method, str)
assert isinstance(reverse, bool)
distxy = squareform(pdist(X, metric=metric))
R = dendrogram(linkage(distxy, method=method), no_plot=True)
order_rows = np.int64([int(l) for l in R['ivl']])
if reverse:
order_rows = order_rows[::-1]
return order_rows
def cluster_genes(matrix, metric='correlation', method='average',
reverse=False):
# note: method = 'average' sometimes causes kernel to crash
order_rows = _cluster_ndarray(matrix, metric=metric, method=method,
reverse=reverse)
matrix = matrix.iloc[order_rows]
return matrix
def cluster_samples(matrix, metric='euclidean', method='average',
reverse=False):
# note: method = 'average' sometimes causes kernel to crash
# workaround for scipy bug when supplied with a row of NaNs
# see: https://github.com/scipy/scipy/issues/5142
valid_rows = (matrix.values != 0).all(axis=1)
filtered = matrix.loc[valid_rows]
order_cols = _cluster_ndarray(filtered.T, metric=metric, method=method,
reverse=reverse)
matrix = matrix.iloc[:, order_cols]
return matrix
def center_genes(df,use_median=False):
p = df.shape[0] # no. of genes (#rows)
n = df.shape[1] # no. of samples (#columns)
X = df
if use_median:
X = X - np.tile(np.median(X,axis=1), (n,1)).T
else:
X = X - np.tile(np.mean(X,axis=1), (n,1)).T
return X
sse = {} # dictionary
for k in range(1,11):
kmeans = KMeans(n_clusters=k, max_iter=100)
kmeans.fit(gxp_log)
sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
def filter_variance(df, top):
p = df.shape[0] # no. of genes (#rows)
n = df.shape[1] # no. of samples (#columns)
if top >= p:
print('Variance filter has no effect '
'("top" parameter is >= No. of genes).')
return df.copy()
var = np.var(df,axis=1, ddof=1)
total_var = np.sum(var) # total sum of variance
a = np.argsort(var) # sort data by specifying index min-to-max
a = a[::-1] # max-to-min
sel = np.zeros(p, dtype=np.bool_)
sel[a[:top]] = True # set selector of 'top' to True
lost_p = p - top
lost_var = total_var - np.sum(var[sel])
print('Selected the {} most variable genes '
'(excluded {:.2f}% of genes, representing {:.2f}% '
'of total variance).'.format(
top, 100*(lost_p / float(p)),
100 * (lost_var /total_var)))
df = df.loc[sel]
return df
from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
# Create linkage matrix and then plot the dendrogram
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack([model.children_, model.distances_,
counts]).astype(float)
# Plot the corresponding dendrogram
dendrogram(linkage_matrix, **kwargs)
# plot the top 'p' levels of the dendrogram
plt.figure(figsize=(8,4))
plot_dendrogram(model1, truncate_mode='level', p=3)
plt.title('Dendrogram (euclidean,ward)')
plt.xlabel('Number of points in node')
plt.show()
import pandas as pd
import numpy as np
data = {
'genes':['AAA','BBB','CCC','DDD','EEE','FFF','GGG','HHH','III','JJJ'],
'1Hrs' :[10, 10, 4, 9.5, 4.5, 10.5, 5.0, 2.7, 9.7, 10.2],
'2Hrs' :[8, 0, 8.5, 0.5, 8.5, 9, 8.5, 8.7, 2, 1],
'3Hrs' :[10, 9, 3, 8.5, 2.5, 12, 11, 2, 9, 9.2]
}
gxp = pd.DataFrame(data)
gxp
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment