## import modules
import os
import copy
import time
from tqdm import tqdm
import numpy as np
from skbio.stats.composition import clr_inv as softmax
from scipy.stats import spearmanr
import datetime
import pandas as pd
from skbio.stats.composition import ilr_inv, clr_inv, alr_inv, clr, centralize, closure
from scipy.sparse import coo_matrix, csr_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# see http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions.multinomial import Multinomial
from torch.nn.utils import clip_grad_norm_
%matplotlib inline
# create toy example
def bimodal(num_x, num_y, means=(-1, 1), sigma=0.1):
    beta = state.normal(0, 1, size=(2, num_y))
    beta = np.sort(beta, axis=1)
    X = np.vstack((np.ones(num_x),
                   np.linspace(means[0], means[1], num_x))).T
    Q = np.tanh(state.normal(X @ beta, sigma))
    return Q

num_microbes = 100
num_metabolites = 100
num_samples = 100
latent_dim = 2
low = -1
high = 1
microbe_total = 300
metabolite_total = 900
uB = 0
sigmaB = 1
sigmaQ = 2
uU = 0
sigmaU = 0.01
uV = 0
sigmaV = 0.01
s = 0.2
seed = 1
eUun = 0.2
eVun = 0.2
state = np.random.RandomState(seed)
# only have two coefficients
#microbes = state.multivariate_normal(
#    mean=np.ones(num_microbes-1), cov=np.diag([sigmaQ] * np.exp(state.randn(num_microbes-1))),
#    size=num_samples)
microbes = bimodal(num_samples, num_microbes, means=(-3, 3))
microbes = clr_inv(microbes)
eUmain = bimodal(num_microbes, latent_dim, means=(-3, 3))
eVmain = bimodal(num_metabolites, latent_dim, means=(-3, 3)).T
eUmain = - eUmain.mean(axis=0) + eUmain - eUmain.mean(axis=1).reshape(-1, 1)
eVmain = - eVmain.mean(axis=0) + eVmain - eVmain.mean(axis=1).reshape(-1, 1)
eUbias = state.normal(
    uU, sigmaU, size=(num_microbes, 1))
eVbias = state.normal(
    uV, sigmaV, size=(1, num_metabolites))
# TODO: force eUmain and eVmain to be centered?
U_ = np.hstack(
    (np.ones((num_microbes, 1)), eUbias, eUmain))
V_ = np.vstack(
    (eVbias, np.ones((1, num_metabolites)), eVmain))
phi = U_ @ V_
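# Editorial note (not in the original gist): U_ is (num_microbes, latent_dim + 2)
# and V_ is (latent_dim + 2, num_metabolites), so phi is
# (num_microbes, num_metabolites); row i holds the unnormalized log conditional
# intensities of each metabolite given microbe i, before the per-sample noise
# added in the simulation loop below.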
microbe_counts = np.zeros((num_samples, num_microbes))
metabolite_counts = np.zeros((num_samples, num_metabolites))
n1 = microbe_total
n2 = metabolite_total // microbe_total
for n in range(num_samples):
    u = state.normal(eUmain, eUun)
    v = state.normal(eVmain, eVun)
    ubias = state.normal(eUbias, eUun)
    vbias = state.normal(eVbias, eVun)
    u_ = np.hstack(
        (np.ones((num_microbes, 1)), ubias, u))
    v_ = np.vstack(
        (vbias, np.ones((1, num_metabolites)), v))
    #phi = np.hstack((np.zeros((num_microbes, 1)), u_ @ v_))
    probs = softmax(u_ @ v_)
    # draw the microbe counts once per sample (the original drew a second,
    # statistically identical multinomial and discarded this one)
    otu = state.multinomial(n1, microbes[n, :])
    for _ in range(microbe_total):
        i = state.choice(np.arange(num_microbes), p=microbes[n, :])
        metabolite_counts[n, :] += state.multinomial(n2, probs[i, :])
    microbe_counts[n, :] = otu
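# Optional sanity check (not in the original gist): since metabolite_total is
# an exact multiple of microbe_total here, every sample should carry exactly
# microbe_total microbe counts and metabolite_total metabolite counts.
assert np.all(microbe_counts.sum(axis=1) == microbe_total)
assert np.all(metabolite_counts.sum(axis=1) == metabolite_total)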
# visualize the simulations - just for fun
sns.heatmap(eUmain @ eVmain, cmap='seismic')
sns.heatmap(u_ @ v_, cmap='seismic')
sns.heatmap(microbe_counts)
sns.heatmap(metabolite_counts)
## Create mmvec classes
class GaussianEmbedding(torch.nn.Module):
    def __init__(self, in_features, out_features):
        """
        In the constructor we instantiate four nn.Embedding modules (means and
        log-variances for the latent factors and the per-feature biases) and
        assign them as member variables.
        """
        super(GaussianEmbedding, self).__init__()
        self.embedding = torch.nn.Embedding(in_features, out_features)
        self.bias = torch.nn.Embedding(in_features, 1)
        self.embedding_var = torch.nn.Embedding(in_features, out_features)
        self.bias_var = torch.nn.Embedding(in_features, 1)
        self.in_features = in_features
        self.out_features = out_features

    def _divergence(self, mu, logvar):
        return 0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    def divergence(self):
        """ Computes the negative KL divergence between posterior and prior. """
        w = self._divergence(self.embedding.weight, self.embedding_var.weight)
        b = self._divergence(self.bias.weight, self.bias_var.weight)
        return w + b

    def reparameterize(self, mu, logvar):
        """ Samples from the posterior distribution via
        reparameterization gradients"""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        embeds = self.reparameterize(
            self.embedding(x),
            self.embedding_var(x)
        )
        biases = self.reparameterize(
            self.bias(x),
            self.bias_var(x)
        )
        pred = embeds + biases
        return pred
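# Quick shape check (illustrative, not part of the original gist): a batch of
# 8 microbe indices should map to 8 latent vectors, with the per-feature bias
# broadcast across the latent dimensions.
_enc = GaussianEmbedding(in_features=num_microbes, out_features=latent_dim)
_codes = _enc(torch.arange(8))
assert _codes.shape == (8, latent_dim)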
class GaussianDecoder(torch.nn.Module):
    def __init__(self, in_features, out_features):
        """
        In the constructor we instantiate two nn.Linear modules (means and
        log-variances) and assign them as member variables.
        """
        super(GaussianDecoder, self).__init__()
        self.mean = torch.nn.Linear(in_features, out_features)
        self.var = torch.nn.Linear(in_features, out_features)
        self.in_features = in_features
        self.out_features = out_features

    def reparameterize(self, mu, logvar):
        """ Samples from the posterior distribution via
        reparameterization gradients"""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def _divergence(self, mu, logvar):
        return 0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    def divergence(self):
        """ Computes the negative KL divergence between posterior and prior. """
        # see Appendix B from VAE paper:
        # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
        # https://arxiv.org/abs/1312.6114
        w = self._divergence(self.mean.weight, self.var.weight)
        b = self._divergence(self.mean.bias, self.var.bias)
        return w + b

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        pred = self.reparameterize(
            self.mean(x),
            self.var(x)
        )
        return pred
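# The closed form used in _divergence above follows Appendix B of Kingma &
# Welling (2014) for a diagonal Gaussian posterior q = N(mu, sigma^2) against
# a standard normal prior:
#   -KL(q || N(0, 1)) = 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
# which is why the loss below adds kld to the likelihood rather than
# subtracting it.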
class MMvec(nn.Module):
    def __init__(self, num_samples, num_microbes, num_metabolites, microbe_total,
                 latent_dim, batch_size=10, subsample_size=100, mc_samples=10,
                 device='cpu'):
        super(MMvec, self).__init__()
        self.num_microbes = num_microbes
        self.num_metabolites = num_metabolites
        self.num_samples = num_samples
        self.device = device
        self.batch_size = batch_size
        self.subsample_size = subsample_size
        self.mc_samples = mc_samples
        self.microbe_total = microbe_total
        self.encoder = GaussianEmbedding(num_microbes, latent_dim)
        self.decoder = GaussianDecoder(latent_dim, num_metabolites)

    def forward(self, x):
        code = self.encoder(x)
        log_probs = self.decoder(code)
        #zeros = torch.zeros(self.batch_size * self.subsample_size, 1)
        #log_probs = torch.cat((zeros, alrs), dim=1)
        return log_probs

    def loss(self, pred, obs):
        """ Computes the loss function (negative ELBO) to be minimized. """
        kld = self.encoder.divergence() + self.decoder.divergence()
        n = self.microbe_total * self.num_samples
        likelihood = n * torch.mean(Multinomial(logits=pred).log_prob(obs))
        elbo = kld + likelihood
        #elbo = likelihood
        return -elbo, kld, likelihood
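# Minimal smoke test (illustrative, not in the original gist): each microbe
# index in a batch maps to a full vector of metabolite logits.
_mmvec = MMvec(num_samples=num_samples, num_microbes=num_microbes,
               num_metabolites=num_metabolites, microbe_total=microbe_total,
               latent_dim=latent_dim)
assert _mmvec(torch.arange(5)).shape == (5, num_metabolites)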
def get_batch(X, Y, i, subsample_size, batch_size):
    """ Retrieves minibatch

    Parameters
    ----------
    X : scipy.sparse.csr_matrix
        Input sparse matrix of abundances
    Y : np.array
        Output dense matrix of abundances
    i : int
        Sample index
    subsample_size : int
        Number of sequences to randomly draw per iteration
    batch_size : int
        Number of samples to load per minibatch

    TODO
    ----
    - It will be worth offloading this work to the torch.data.DataLoader class.
      One advantage is that the GPU work can be done in parallel
      to preparing the data for transfer. This could yield at least
      a 2x speedup.
    """
    Xs = []
    Ys = []
    for n in range(batch_size):
        k = (i + n) % Y.shape[0]
        row = X.getrow(k)
        cnts = np.hstack([row.data[row.indptr[j]:row.indptr[j+1]]
                          for j in range(len(row.indptr)-1)])
        feats = np.hstack([row.indices[row.indptr[j]:row.indptr[j+1]]
                           for j in range(len(row.indptr)-1)])
        inp = np.random.choice(feats, p=cnts/np.sum(cnts), size=subsample_size)
        Xs.append(inp)
        Ys.append(Y[k])
    Xs = np.hstack(Xs)
    Ys = np.repeat(np.vstack(Ys), subsample_size, axis=0)
    return torch.from_numpy(Xs).long(), torch.from_numpy(Ys).float()
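# Illustrative usage (not in the original gist): a single minibatch pairs each
# subsampled microbe index with its sample's metabolite counts, so both
# returned tensors have batch_size * subsample_size rows.
_x, _y = get_batch(csr_matrix(microbe_counts), metabolite_counts, 0,
                   subsample_size=5, batch_size=2)
assert _x.shape == (10,) and _y.shape == (10, num_metabolites)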
## preprocessing for mmvec
trainX = csr_matrix(microbe_counts)
trainY = metabolite_counts
d1 = microbe_counts.shape[1]
d2 = metabolite_counts.shape[1]

epochs = 100
mc_samples = 1
beta1 = 0.8
beta2 = 0.9
gamma = 0.1
step_size = 20
batch_size = 10
subsample_size = 500
microbe_total = microbe_counts.sum()

# optimization parameters
learning_rate = 1e-1
clipnorm = 100
iterations = 1000

# log path
basename = "logdir"
suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
save_path = "_".join([basename, suffix])
## actually fit the model
model = MMvec(num_samples=num_samples, num_microbes=num_microbes, num_metabolites=num_metabolites,
              microbe_total=microbe_total, latent_dim=latent_dim, batch_size=batch_size,
              subsample_size=subsample_size,
              device='cpu')
optimizer = optim.Adam(model.parameters(), betas=(beta1, beta2),
                       lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, step_size=step_size, gamma=gamma)

best_loss = np.inf
losses = []
klds = []
likes = []
errs = []
for ep in tqdm(range(0, epochs)):
    model.train()
    # 2 rounds so that both matrices can be iterated
    for _ in range(2):
        for i in range(0, num_samples, model.batch_size):
            optimizer.zero_grad()
            inp, out = get_batch(trainX, trainY, i % num_samples,
                                 model.subsample_size, model.batch_size)
            pred = model.forward(inp)
            loss, kld, like = model.loss(pred, out)
            err = torch.mean(torch.abs(F.softmax(pred, dim=1) * metabolite_total - out))
            loss.backward()
            errs.append(err.item())
            losses.append(loss.item())
            klds.append(kld.item())
            likes.append(like.item())
            optimizer.step()
    # step the learning-rate schedule once per epoch, after the optimizer updates
    scheduler.step()
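# Note: clip_grad_norm_ is imported above but never applied; if gradient
# clipping is wanted, a hedged sketch would be to call
#   clip_grad_norm_(model.parameters(), clipnorm)
# between loss.backward() and optimizer.step() inside the inner loop.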
## diagnostics for the fit
plt.plot(errs)
plt.xlabel('iterations', fontsize=14)
plt.ylabel('errors', fontsize=14)

plt.plot(-np.array(likes))
plt.xlabel('iterations', fontsize=14)
plt.ylabel('negative likelihood', fontsize=14)
plt.yscale('log')

plt.plot(klds)
plt.xlabel('iterations', fontsize=14)
plt.ylabel('KL', fontsize=14)

plt.plot(losses)
plt.xlabel('iterations', fontsize=14)
plt.ylabel('negative ELBO', fontsize=14)
plt.yscale('log')
u = model.encoder.embedding.weight.detach().numpy()
v = model.decoder.mean.weight.detach().numpy()
ubias = model.encoder.bias.weight.detach().numpy()
vbias = model.decoder.mean.bias.detach().numpy()

def kl(mu, logvar):
    # per-element KL(N(mu, exp(logvar)) || N(0, 1))
    return -0.5 * (1 + logvar - mu**2 - np.exp(logvar))
sns.distplot(np.ravel(eUmain))
sns.distplot(np.ravel(u))

from scipy.spatial.distance import pdist
plt.scatter(pdist(eUmain), pdist(u))

mu = np.ravel(model.encoder.embedding.weight.detach())
logv = np.ravel(model.encoder.embedding_var.weight.detach())
print(kl(mu, logv).sum())
sns.distplot(kl(mu, logv))

mu = np.ravel(model.encoder.bias.weight.detach())
logv = np.ravel(model.encoder.bias_var.weight.detach())
print(kl(mu, logv).sum())
sns.distplot(kl(mu, logv))

mu = np.ravel(model.decoder.mean.weight.detach())
logv = np.ravel(model.decoder.var.weight.detach())
print(kl(mu, logv).sum())
sns.distplot(kl(mu, logv))

mu = np.ravel(model.decoder.mean.bias.detach())
logv = np.ravel(model.decoder.var.bias.detach())
print(kl(mu, logv).sum())
sns.distplot(kl(mu, logv))

sns.distplot(np.ravel(eVmain))
sns.distplot(np.ravel(v))
plt.scatter(pdist(eVmain.T), pdist(v))
modelU = np.hstack(
    (np.ones((u.shape[0], 1)), ubias, u))
modelV = np.hstack(
    (vbias.reshape(-1, 1), np.ones((v.shape[0], 1)), v)).T

exp = phi - phi.mean(axis=1).reshape(-1, 1)
exp = exp - exp.mean(axis=0)
res = modelU @ modelV
res = res - res.mean(axis=1).reshape(-1, 1)
res = res - res.mean(axis=0)

sns.distplot(exp.ravel())
sns.distplot(res.ravel())
plt.scatter(exp.ravel(), res.ravel())
plt.xlabel('simulated ranks')
plt.ylabel('predicted ranks')
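# Extra diagnostic (not in the original gist): spearmanr is imported above but
# unused; the rank agreement between simulated and predicted conditionals can
# be summarized with a single correlation.
rho, pval = spearmanr(exp.ravel(), res.ravel())
print('Spearman r = %.3f (p = %.3e)' % (rho, pval))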