Created
June 13, 2020 22:56
-
-
Save chao1224/ca505bdcf5064ff9fe1abc9c45acbf80 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''This is demo scripts for running n_gram_graph on delaney.''' | |
from __future__ import print_function | |
import argparse | |
import os | |
import numpy as np | |
import json | |
import torch | |
import torch.nn as nn | |
from torch.autograd import Variable | |
import torch.optim as optim | |
from torch.utils.data import Dataset, DataLoader | |
import torch.backends.cudnn as cudnn | |
import n_gram_graph as ngg | |
from n_gram_graph.embedding.node_embedding import CBoW, train, test | |
from n_gram_graph.model.xgboost_regression import XGBoostRegression | |
from n_gram_graph.model.random_forest_regression import RandomForestRegression | |
from n_gram_graph.util import * | |
from n_gram_graph.dataset_specification import dataset2task_list | |
def node_embedding_get_data(data_path, padding_size):
    """Build CBoW training pairs from a saved molecular-graph archive.

    For every real atom in every molecule, the attribute rows of its
    neighbours form the context matrix (zero-padded to ``padding_size``
    rows) and the atom's own one-hot feature segments give one class
    label per segment (multi-task targets).

    NOTE(review): relies on module-level globals ``max_atom_num``,
    ``feature_num`` and ``segmentation_list`` defined in the main script.

    :param data_path: path to an ``.npz`` file with keys
        ``adjacent_matrix_list`` and ``node_attribute_matrix_list``.
    :param padding_size: maximum number of neighbour rows kept per atom.
    :return: tuple ``(X_data, Y_label_list)`` of numpy arrays.
    """
    archive = np.load(data_path)
    print(archive.keys())
    print(data_path)
    adjacency_all = archive['adjacent_matrix_list']
    attributes_all = archive['node_attribute_matrix_list']
    print('molecule num\t', adjacency_all.shape[0])
    context_list, label_list = [], []
    print('adjacent_matrix_list shape: {}\tnode_attribute_matrix_list shape: {}'.format(adjacency_all.shape, attributes_all.shape))
    for adjacency, attributes in zip(adjacency_all, attributes_all):
        assert len(adjacency) == max_atom_num
        assert len(attributes) == max_atom_num
        for atom_idx in range(max_atom_num):
            # an all-zero adjacency row marks the start of the padding atoms
            if sum(adjacency[atom_idx]) == 0:
                break
            context = np.zeros((padding_size, feature_num))
            neighbor_cnt = 0
            for other_idx in range(max_atom_num):
                if adjacency[atom_idx][other_idx] == 1:
                    context[neighbor_cnt] = attributes[other_idx]
                    neighbor_cnt += 1
            # one label per feature segment: position of the hot bit
            atom_feature = attributes[atom_idx]
            labels = [atom_feature[segment].argmax() for segment in segmentation_list]
            context_list.append(np.array(context))
            label_list.append(labels)
    return np.array(context_list), np.array(label_list)
class NodeEmbeddingGraphDataset(Dataset):
    """Torch dataset of (neighbour-context, per-segment-label) pairs,
    aggregated over several cross-validation folds.

    Each fold ``i`` in ``K_list`` is loaded from
    ``./datasets/{mode}/{i}_graph.npz`` via ``node_embedding_get_data``.
    """

    def __init__(self, mode, K_list, padding_size, segmentation_list):
        context_chunks, label_chunks = [], []
        for fold in K_list:
            path = './datasets/{}/{}_graph.npz'.format(mode, fold)
            fold_x, fold_y = node_embedding_get_data(data_path=path, padding_size=padding_size)
            context_chunks.extend(fold_x)
            label_chunks.extend(fold_y)
        self.X_data = np.array(context_chunks)
        self.Y_label_list = np.array(label_chunks)
        print('data size: ', self.X_data.shape, '\tlabel size: ', self.Y_label_list.shape)
        self.segmentation_list = segmentation_list

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, idx):
        # return the sample as torch tensors (dtype inherited from numpy)
        return torch.from_numpy(self.X_data[idx]), torch.from_numpy(self.Y_label_list[idx])
def node_embedding_train():
    """Fit the global CBoW ``model`` on ``train_dataloader``.

    Runs for ``epochs`` epochs, summing one cross-entropy term per feature
    segment, and checkpoints the weights to ``weight_file`` whenever the
    mean epoch loss improves.

    NOTE(review): relies on module-level globals ``model``, ``optimizer``,
    ``epochs``, ``train_dataloader``, ``segmentation_num``, ``weight_file``
    and ``embedding_dimension``.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    best_loss = 1e7
    for epoch in range(epochs):
        epoch_losses = []
        for x_data, y_actual in train_dataloader:
            x_data = Variable(x_data).float()
            y_actual = Variable(y_actual).long()
            if torch.cuda.is_available():
                x_data = x_data.cuda()
                y_actual = y_actual.cuda()
            optimizer.zero_grad()
            y_predict = model(x_data)
            # one classification head per feature segment; sum their losses
            loss = 0
            for seg in range(segmentation_num):
                loss += criterion(y_predict[seg], y_actual[..., seg])
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())
        mean_loss = np.mean(epoch_losses)
        print('epoch: {}\tloss is: {}'.format(epoch, mean_loss))
        if mean_loss < best_loss:
            best_loss = mean_loss
            print('Saving model at epoch {}\toptimal loss is {}.'.format(epoch, best_loss))
            torch.save(model.state_dict(), weight_file)
    print('For random dimension as {}.'.format(embedding_dimension))
    return
def node_embedding_test(dataloader):
    """Report classification accuracy of the global ``model`` on a loader.

    Accuracy is pooled over all feature segments (total correct segment
    predictions divided by total segment predictions made).

    NOTE(review): relies on module-level globals ``model``,
    ``segmentation_num`` and ``embedding_dimension``.
    """
    model.eval()
    correct, total = 0, 0
    for x_data, y_actual in dataloader:
        x_data = Variable(x_data).float()
        y_actual = Variable(y_actual).long()
        if torch.cuda.is_available():
            x_data = x_data.cuda()
            y_actual = y_actual.cuda()
        y_predict = model(x_data)
        for seg in range(segmentation_num):
            truth = y_actual[..., seg].cpu().data.numpy()
            predicted = y_predict[seg].cpu().data.numpy().argmax(1)
            correct += np.sum(truth == predicted)
            total += predicted.shape[0]
    accuracy = 1. * correct / total
    print('Accuracy: {}'.format(accuracy))
    print('For random dimension as {}.'.format(embedding_dimension))
    return
def graph_embedding_get_data(data_path):
    """Load one fold's graph matrices from an ``.npz`` archive.

    :param data_path: path to an archive containing ``adjacent_matrix_list``,
        ``distance_matrix_list``, ``bond_attribute_matrix_list``,
        ``node_attribute_matrix_list`` and ``label_name``.
    :return: the four matrix lists (in that order) plus a kwargs dict
        carrying ``label_name`` for later re-saving.
    """
    archive = np.load(data_path)
    print(archive.keys())
    kwargs = {'label_name': archive['label_name']}
    return (archive['adjacent_matrix_list'],
            archive['distance_matrix_list'],
            archive['bond_attribute_matrix_list'],
            archive['node_attribute_matrix_list'],
            kwargs)
class GraphEmbeddingGraphDataset(Dataset):
    """Torch dataset over per-molecule graph matrices.

    Wraps parallel lists of node-attribute, adjacency and distance
    matrices; each item is returned as a triple of torch tensors.
    """

    def __init__(self, node_attribute_matrix_list, adjacent_matrix_list, distance_matrix_list):
        self.node_attribute_matrix_list = node_attribute_matrix_list
        self.adjacent_matrix_list = adjacent_matrix_list
        self.distance_matrix_list = distance_matrix_list

    def __len__(self):
        return len(self.node_attribute_matrix_list)

    def __getitem__(self, idx):
        matrices = (self.node_attribute_matrix_list[idx],
                    self.adjacent_matrix_list[idx],
                    self.distance_matrix_list[idx])
        return tuple(torch.from_numpy(m) for m in matrices)
def graph_embedding_get_walk_representation(dataloader):
    """Embed every graph and compute its 1..6-gram walk statistics.

    ``v_1`` is the sum of the embedded node vectors; each subsequent
    ``v_k`` propagates the walk through the adjacency matrix (gated
    element-wise by the node embeddings) before summing, giving one
    vector per n-gram order.  The six vectors are stacked per graph.

    NOTE(review): relies on the trained global ``model`` for
    ``model.embeddings``.

    :return: ``(embedded_node_matrix_list, embedded_graph_matrix_list)``
        as numpy arrays.
    """
    node_embeddings = []
    graph_embeddings = []
    for node_attribute_matrix, adjacent_matrix, distance_matrix in dataloader:
        node_attribute_matrix = Variable(node_attribute_matrix).float()
        adjacent_matrix = Variable(adjacent_matrix).float()
        distance_matrix = Variable(distance_matrix).float()
        if torch.cuda.is_available():
            node_attribute_matrix = node_attribute_matrix.cuda()
            adjacent_matrix = adjacent_matrix.cuda()
            distance_matrix = distance_matrix.cuda()
        tilde_node_attribute_matrix = model.embeddings(node_attribute_matrix)
        # successive walk orders v_1 .. v_6
        walk = tilde_node_attribute_matrix
        order_vectors = [torch.sum(walk, dim=1)]
        for _ in range(5):
            walk = torch.bmm(adjacent_matrix, walk) * tilde_node_attribute_matrix
            order_vectors.append(torch.sum(walk, dim=1))
        embedded_graph_matrix = torch.stack(order_vectors, dim=1)
        if torch.cuda.is_available():
            tilde_node_attribute_matrix = tilde_node_attribute_matrix.cpu()
            embedded_graph_matrix = embedded_graph_matrix.cpu()
        node_embeddings.extend(tilde_node_attribute_matrix.data.numpy())
        graph_embeddings.extend(embedded_graph_matrix.data.numpy())
    embedded_node_matrix_list = np.array(node_embeddings)
    embedded_graph_matrix_list = np.array(graph_embeddings)
    print('embedded_node_matrix_list: ', embedded_node_matrix_list.shape)
    print('embedded_graph_matrix_list shape: {}'.format(embedded_graph_matrix_list.shape))
    return embedded_node_matrix_list, embedded_graph_matrix_list
def run_n_gram_xgb():
    """Train and evaluate the N-Gram + XGBoost regressor for one CV split.

    Loads hyper-parameters from ``config_json_file``, trains on the four
    non-held-out folds, evaluates on the held-out fold, and saves the
    test predictions to ``output_on_test.npz``.

    NOTE(review): relies on module-level globals ``config_json_file``,
    ``label_name``, ``running_index``, ``file_list``, ``n_gram_num`` and
    ``weight_file``.
    """
    with open(config_json_file, 'r') as f:
        conf = json.load(f)
    label_name_list = [label_name]
    print('label_name_list ', label_name_list)
    test_index = [running_index]
    # BUGFIX: `filter(...)` is a lazy iterator on Python 3 and cannot be
    # used as a numpy fancy index; materialize the fold indices as a list.
    train_index = [x for x in np.arange(5) if x not in test_index]
    train_file_list = file_list[train_index]
    test_file_list = file_list[test_index]
    print('train files ', train_file_list)
    print('test files ', test_file_list)
    X_train, y_train = extract_feature_and_label_npy(train_file_list,
                                                     feature_name='embedded_graph_matrix_list',
                                                     label_name_list=label_name_list,
                                                     n_gram_num=n_gram_num)
    X_test, y_test = extract_feature_and_label_npy(test_file_list,
                                                   feature_name='embedded_graph_matrix_list',
                                                   label_name_list=label_name_list,
                                                   n_gram_num=n_gram_num)
    print('done data preparation')
    task = XGBoostRegression(conf=conf)
    task.train_and_predict(X_train, y_train, X_test, y_test, weight_file)
    task.eval_with_existing(X_train, y_train, X_test, y_test, weight_file)
    y_pred_on_test = task.predict_with_existing(X_test, weight_file)
    np.savez('output_on_test', y_test=y_test, y_pred=y_pred_on_test)
    return
def run_n_gram_rf():
    """Train and evaluate the N-Gram + Random Forest regressor for one CV split.

    Mirrors ``run_n_gram_xgb`` but with ``RandomForestRegression``; saves
    the test predictions to ``output_on_test.npz``.

    NOTE(review): relies on module-level globals ``config_json_file``,
    ``label_name``, ``running_index``, ``file_list``, ``n_gram_num`` and
    ``weight_file``.
    """
    with open(config_json_file, 'r') as f:
        conf = json.load(f)
    label_name_list = [label_name]
    test_index = [running_index]
    # BUGFIX: `filter(...)` is a lazy iterator on Python 3 and cannot be
    # used as a numpy fancy index; materialize the fold indices as a list.
    train_index = [x for x in np.arange(5) if x not in test_index]
    train_file_list = file_list[train_index]
    test_file_list = file_list[test_index]
    X_train, y_train = extract_feature_and_label_npy(train_file_list,
                                                     feature_name='embedded_graph_matrix_list',
                                                     label_name_list=label_name_list,
                                                     n_gram_num=n_gram_num)
    X_test, y_test = extract_feature_and_label_npy(test_file_list,
                                                   feature_name='embedded_graph_matrix_list',
                                                   label_name_list=label_name_list,
                                                   n_gram_num=n_gram_num)
    print('done data preparation')
    task = RandomForestRegression(conf=conf)
    task.train_and_predict(X_train, y_train, X_test, y_test, weight_file)
    task.eval_with_existing(X_train, y_train, X_test, y_test, weight_file)
    y_pred_on_test = task.predict_with_existing(X_test, weight_file)
    np.savez('output_on_test', y_test=y_test, y_pred=y_pred_on_test)
    return
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='delaney')
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--seed', type=int, default=123)
    args = parser.parse_args()

    epochs = args.epochs
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        cudnn.benchmark = True

    # NOTE(review): the --task argument is parsed but ignored; this demo
    # script is hard-coded to the delaney dataset.
    task = 'delaney'
    feature_num = 42
    # Column ranges of the one-hot segments inside the 42-dim atom feature
    # vector; each segment becomes one CBoW classification task.
    segmentation_list = [range(0, 10), range(10, 17), range(17, 24), range(24, 30), range(30, 36), range(36, 38), range(38, 40), range(40, 42)]
    segmentation_list = np.array(segmentation_list)
    segmentation_num = len(segmentation_list)
    max_atom_num = 55
    padding_size = 6
    embedding_dimension = 100

    ############### Learning The Representation In An Unsupervised Way ###############
    for running_index in range(5):
        test_list = [running_index]
        # BUGFIX: `filter(...)` is a lazy, one-shot iterator on Python 3
        # (it would print as `<filter object>` below); materialize a list.
        train_list = [x for x in np.arange(5) if x not in test_list]
        print('training list: {}\ttest list: {}'.format(train_list, test_list))

        dir_ = 'model_weight/{}/{}'.format(task, running_index)
        if not os.path.isdir(dir_):
            os.makedirs(dir_)
        weight_file = '{}/{}_CBoW_non_segment.pt'.format(dir_, embedding_dimension)

        model = CBoW(feature_num=feature_num, embedding_dim=embedding_dimension,
                     task_num=segmentation_num, task_size_list=segmentation_list)
        if torch.cuda.is_available():
            model.cuda()
        optimizer = optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-4)

        train_dataset = NodeEmbeddingGraphDataset(task, K_list=train_list, segmentation_list=segmentation_list, padding_size=padding_size)
        test_dataset = NodeEmbeddingGraphDataset(task, K_list=test_list, segmentation_list=segmentation_list, padding_size=padding_size)
        train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
        test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=True)

        # Stage 1: train the node-level CBoW embedding and report accuracy.
        node_embedding_train()
        node_embedding_test(train_dataloader)
        node_embedding_test(test_dataloader)
        print()
        print('Done with node embedding on {}.'.format(running_index))

        # Stage 2: freeze the model and export the walk-based graph
        # representation for every fold under this split's directory.
        model.eval()
        for i in range(5):
            dir_ = './datasets/{}'.format(task)
            if not os.path.isdir(dir_):
                os.makedirs(dir_)
            data_path = '{}/{}_graph.npz'.format(dir_, i)
            adjacent_matrix_list, distance_matrix_list, bond_attribute_matrix_list, node_attribute_matrix_list, kwargs = graph_embedding_get_data(data_path)
            dataset = GraphEmbeddingGraphDataset(node_attribute_matrix_list=node_attribute_matrix_list, adjacent_matrix_list=adjacent_matrix_list, distance_matrix_list=distance_matrix_list)
            dataloader = DataLoader(dataset, batch_size=128, shuffle=False)
            embedded_node_matrix_list, embedded_graph_matrix_list = graph_embedding_get_walk_representation(dataloader)
            dir_ = './datasets/{}/{}'.format(task, running_index)
            if not os.path.isdir(dir_):
                os.makedirs(dir_)
            out_file_path = '{}/{}_grammed_cbow_{}_graph'.format(dir_, i, embedding_dimension)
            kwargs['adjacent_matrix_list'] = adjacent_matrix_list
            kwargs['distance_matrix_list'] = distance_matrix_list
            kwargs['embedded_node_matrix_list'] = embedded_node_matrix_list
            kwargs['embedded_graph_matrix_list'] = embedded_graph_matrix_list
            np.savez_compressed(out_file_path, **kwargs)
            print(kwargs.keys())
        print()
        print()
        print()

    ############### Running Model ###############
    label_name = 'label_name'
    n_gram_num = 6
    weight_file = 'temp.pt'
    for running_index in range(5):
        directory = './datasets/{}/{}/{{}}_grammed_cbow_{}_graph.npz'.format(task, running_index, embedding_dimension)
        file_list = []
        for i in range(5):
            file_list.append(directory.format(i))
        file_list = np.array(file_list)
        print('file_list\t', file_list)

        # XGBoost regressor on the n-gram graph features.
        model = 'n_gram_xgb'
        config_json_file = 'hyper/{}/{}.json'.format(model, task)
        dir_ = './output/{}/{}'.format(model, running_index)
        if not os.path.isdir(dir_):
            os.makedirs(dir_)
        run_n_gram_xgb()
        os.rename('output_on_test.npz', '{}/{}.npz'.format(dir_, task))

        # Random Forest regressor on the same features.
        model = 'n_gram_rf'
        config_json_file = 'hyper/{}/{}.json'.format(model, task)
        dir_ = './output/{}/{}'.format(model, running_index)
        if not os.path.isdir(dir_):
            os.makedirs(dir_)
        run_n_gram_rf()
        os.rename('output_on_test.npz', '{}/{}.npz'.format(dir_, task))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment