Last active
April 27, 2019 15:46
-
-
Save chao1224/1594dabba74956029216c9b551132cee to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Mon Mar 06 14:25:40 2017 | |
@author: Zhenqin Wu | |
""" | |
from __future__ import print_function | |
from __future__ import division | |
from __future__ import unicode_literals | |
import os | |
import time | |
import csv | |
import numpy as np | |
import tensorflow as tf | |
import deepchem | |
import pickle | |
from deepchem.molnet.run_benchmark_models import benchmark_classification, benchmark_regression | |
from deepchem.molnet.check_availability import CheckFeaturizer, CheckSplit | |
from deepchem.molnet.preset_hyper_parameters import hps | |
def run_benchmark(datasets,
                  model,
                  split=None,
                  metric=None,
                  direction=True,
                  featurizer=None,
                  n_features=0,
                  out_path='.',
                  hyper_parameters=None,
                  hyper_param_search=False,
                  max_iter=20,
                  search_range=2,
                  test=False,
                  reload=True,
                  seed=123):
  """
  Run benchmark test on designated datasets with deepchem (or user-defined) model.

  Results are appended to ``<out_path>/results.csv``; when hyperparameter
  search is enabled, the best parameters found are pickled to
  ``<out_path>/<dataset><model>.pkl``.

  Parameters
  ----------
  datasets: list of string
    choice of which datasets to use, should be: bace_c, bace_r, bbbp, chembl,
    clearance, clintox, delaney, hiv, hopv, kaggle, lipo, muv, nci, pcba,
    pdbbind, ppb, qm7, qm7b, qm8, qm9, sampl, sider, tox21, toxcast
  model: string or user-defined model structure
    choice of which model to use, deepchem provides implementation of
    logistic regression, random forest, multitask network,
    bypass multitask network, irv, graph convolution;
    for user-defined model, it should include function: fit, evaluate
  split: string, optional (default=None)
    choice of splitter function, None = using the default splitter
  metric: string, optional (default=None)
    choice of evaluation metrics, None = using the default metrics (AUC & R2)
  direction: bool, optional (default=True)
    Optimization direction when doing hyperparameter search
    Maximization(True) or minimization(False)
  featurizer: string or dc.feat.Featurizer, optional (default=None)
    choice of featurization, None = using the default corresponding to model
    (string only applicable to deepchem models)
  n_features: int, optional (default=0)
    depending on featurizers, redefined when using deepchem featurizers,
    need to be specified for user-defined featurizers (if using deepchem models)
  out_path: string, optional (default='.')
    path of result file
  hyper_parameters: dict, optional (default=None)
    hyper parameters for designated model, None = use preset values
  hyper_param_search: bool, optional (default=False)
    whether to perform hyper parameter search, using gaussian process by default
  max_iter: int, optional (default=20)
    number of optimization trials
  search_range: int(float), optional (default=2)
    optimization on [initial values / search_range,
    initial values * search_range]
  test: boolean, optional (default=False)
    whether to evaluate on test set
  reload: boolean, optional (default=True)
    whether to save and reload featurized datasets
  seed: int, optional (default=123)
    random seed forwarded to the deepchem benchmark routines
  """
  # "Model given as a name string" check that works on both Python 2
  # (where literals here are unicode because of unicode_literals, so a
  # plain isinstance(model, str) would be wrong) and Python 3 (where
  # basestring does not exist).
  try:
    string_types = (basestring,)  # noqa: F821 -- Python 2 only
  except NameError:
    string_types = (str,)

  for dataset in datasets:
    # Resolve defaults into per-dataset locals.  The original code
    # overwrote the `metric`/`featurizer`/`n_features` arguments, so the
    # values resolved for the first dataset leaked into every later one
    # (wrong metric when mixing classification and regression datasets,
    # and the per-dataset featurizer lookup was skipped entirely).
    dataset_metric = metric
    dataset_featurizer = featurizer
    dataset_n_features = n_features

    if dataset in [
        'bace_c', 'bbbp', 'clintox', 'hiv', 'muv', 'pcba', 'sider', 'tox21',
        'toxcast'
    ]:
      mode = 'classification'
      if dataset_metric is None:
        dataset_metric = [
            deepchem.metrics.Metric(deepchem.metrics.roc_auc_score, np.mean),
        ]
    elif dataset in [
        'bace_r', 'chembl', 'clearance', 'delaney', 'hopv', 'kaggle', 'lipo',
        'nci', 'pdbbind', 'ppb', 'qm7', 'qm7b', 'qm8', 'qm9', 'sampl'
    ]:
      mode = 'regression'
      if dataset_metric is None:
        dataset_metric = [
            deepchem.metrics.Metric(deepchem.metrics.pearson_r2_score, np.mean)
        ]
    else:
      raise ValueError('Dataset not supported')

    if dataset_featurizer is None and isinstance(model, string_types):
      # Assigning featurizer if not user defined
      pair = (dataset, model)
      if pair in CheckFeaturizer:
        dataset_featurizer = CheckFeaturizer[pair][0]
        dataset_n_features = CheckFeaturizer[pair][1]
      else:
        # No known featurizer for this (dataset, model) pair; skip it.
        continue

    if split not in [None] + CheckSplit[dataset]:
      continue

    loading_functions = {
        'bace_c': deepchem.molnet.load_bace_classification,
        'bace_r': deepchem.molnet.load_bace_regression,
        'bbbp': deepchem.molnet.load_bbbp,
        'chembl': deepchem.molnet.load_chembl,
        'clearance': deepchem.molnet.load_clearance,
        'clintox': deepchem.molnet.load_clintox,
        'delaney': deepchem.molnet.load_delaney,
        'hiv': deepchem.molnet.load_hiv,
        'hopv': deepchem.molnet.load_hopv,
        'kaggle': deepchem.molnet.load_kaggle,
        'lipo': deepchem.molnet.load_lipo,
        'muv': deepchem.molnet.load_muv,
        'nci': deepchem.molnet.load_nci,
        'pcba': deepchem.molnet.load_pcba,
        'pdbbind': deepchem.molnet.load_pdbbind_grid,
        'ppb': deepchem.molnet.load_ppb,
        'qm7': deepchem.molnet.load_qm7_from_mat,
        'qm7b': deepchem.molnet.load_qm7b_from_mat,
        'qm8': deepchem.molnet.load_qm8,
        'qm9': deepchem.molnet.load_qm9,
        'sampl': deepchem.molnet.load_sampl,
        'sider': deepchem.molnet.load_sider,
        'tox21': deepchem.molnet.load_tox21,
        'toxcast': deepchem.molnet.load_toxcast
    }

    print('-------------------------------------')
    print('Benchmark on dataset: %s' % dataset)
    print('-------------------------------------')
    # loading datasets
    if split is not None:
      print('Splitting function: %s' % split)
      tasks, all_dataset, transformers = loading_functions[dataset](
          featurizer=dataset_featurizer, split=split, reload=reload)
    else:
      tasks, all_dataset, transformers = loading_functions[dataset](
          featurizer=dataset_featurizer, reload=reload)
    train_dataset, valid_dataset, test_dataset = all_dataset

    time_start_fitting = time.time()
    train_score = {}
    valid_score = {}
    test_score = {}

    if hyper_param_search:
      if hyper_parameters is None:
        hyper_parameters = hps[model]
      search_mode = deepchem.hyper.GaussianProcessHyperparamOpt(model)
      hyper_param_opt, _ = search_mode.hyperparam_search(
          hyper_parameters,
          train_dataset,
          valid_dataset,
          transformers,
          dataset_metric,
          direction=direction,
          n_features=dataset_n_features,
          n_tasks=len(tasks),
          max_iter=max_iter,
          search_range=search_range)
      # Reuse the tuned parameters for the final fit (and later datasets).
      hyper_parameters = hyper_param_opt

    if isinstance(model, string_types):
      if mode == 'classification':
        train_score, valid_score, test_score = benchmark_classification(
            train_dataset,
            valid_dataset,
            test_dataset,
            tasks,
            transformers,
            dataset_n_features,
            dataset_metric,
            model,
            test=test,
            hyper_parameters=hyper_parameters,
            seed=seed)
      elif mode == 'regression':
        train_score, valid_score, test_score = benchmark_regression(
            train_dataset,
            valid_dataset,
            test_dataset,
            tasks,
            transformers,
            dataset_n_features,
            dataset_metric,
            model,
            test=test,
            hyper_parameters=hyper_parameters,
            seed=seed)
    else:
      # User-defined model: only requires fit/evaluate.
      model.fit(train_dataset)
      train_score['user_defined'] = model.evaluate(train_dataset,
                                                   dataset_metric, transformers)
      valid_score['user_defined'] = model.evaluate(valid_dataset,
                                                   dataset_metric, transformers)
      if test:
        test_score['user_defined'] = model.evaluate(test_dataset,
                                                    dataset_metric,
                                                    transformers)
    time_finish_fitting = time.time()

    # Append one CSV row per task/metric to the shared results file.
    with open(os.path.join(out_path, 'results.csv'), 'a') as f:
      writer = csv.writer(f)
      model_name = list(train_score.keys())[0]
      for i in train_score[model_name]:
        output_line = [
            dataset,
            str(split), mode, model_name, i, 'train',
            train_score[model_name][i], 'valid', valid_score[model_name][i]
        ]
        if test:
          output_line.extend(['test', test_score[model_name][i]])
        output_line.extend(
            ['time_for_running', time_finish_fitting - time_start_fitting])
        writer.writerow(output_line)

    if hyper_param_search:
      # Binary mode is required for pickle on Python 3 (original used 'w',
      # which raises TypeError there).
      with open(os.path.join(out_path, dataset + model + '.pkl'), 'wb') as f:
        pickle.dump(hyper_parameters, f)
#
# Note by @XericZephyr. Reasons why I spun off this function:
# 1. Some models need dataset information.
# 2. It gives us the possibility to **cache** the dataset
#    if the featurizer runs very slowly, e.g., GraphConv.
# 2+. The cache can even happen on Travis CI to accelerate
#     CI testing.
#
def load_dataset(dataset, featurizer, split='random'):
  """
  Load specific dataset for benchmark.

  Parameters
  ----------
  dataset: string
    choice of which datasets to use, should be: tox21, muv, sider,
    toxcast, pcba, delaney, kaggle, nci, clintox, hiv, pdbbind, chembl,
    qm7, qm7b, qm9, sampl
  featurizer: string or dc.feat.Featurizer.
    choice of featurization.
  split: string, optional (default='random')
    choice of splitter function, None = using the default splitter

  Returns
  -------
  tasks, all_dataset, transformers as returned by the molnet loader.
  """
  dataset_loading_functions = {
      'bace_c': deepchem.molnet.load_bace_classification,
      'bace_r': deepchem.molnet.load_bace_regression,
      'bbbp': deepchem.molnet.load_bbbp,
      'chembl': deepchem.molnet.load_chembl,
      'clearance': deepchem.molnet.load_clearance,
      'clintox': deepchem.molnet.load_clintox,
      'delaney': deepchem.molnet.load_delaney,
      'hiv': deepchem.molnet.load_hiv,
      'hopv': deepchem.molnet.load_hopv,
      'kaggle': deepchem.molnet.load_kaggle,
      'lipo': deepchem.molnet.load_lipo,
      'muv': deepchem.molnet.load_muv,
      'nci': deepchem.molnet.load_nci,
      'pcba': deepchem.molnet.load_pcba,
      'pdbbind': deepchem.molnet.load_pdbbind_grid,
      'ppb': deepchem.molnet.load_ppb,
      'qm7': deepchem.molnet.load_qm7_from_mat,
      'qm7b': deepchem.molnet.load_qm7b_from_mat,
      'qm8': deepchem.molnet.load_qm8,
      'qm9': deepchem.molnet.load_qm9,
      'sampl': deepchem.molnet.load_sampl,
      'sider': deepchem.molnet.load_sider,
      'tox21': deepchem.molnet.load_tox21,
      'toxcast': deepchem.molnet.load_toxcast
  }
  print('-------------------------------------')
  print('Loading dataset: %s' % dataset)
  print('-------------------------------------')
  # loading datasets
  if split is not None:
    print('Splitting function: %s' % split)
    tasks, all_dataset, transformers = dataset_loading_functions[dataset](
        featurizer=featurizer, split=split)
  else:
    # Fix: the original only loaded inside the `split is not None` branch,
    # so passing split=None raised UnboundLocalError at the return below.
    tasks, all_dataset, transformers = dataset_loading_functions[dataset](
        featurizer=featurizer)
  return tasks, all_dataset, transformers
def benchmark_model(model, all_dataset, transformers, metric, test=False):
  """
  Benchmark a custom model on pre-loaded datasets.

  Parameters
  ----------
  model: user-defined model structure
    must provide fit(dataset) and evaluate(dataset, metric, transformers).
  all_dataset: (train, valid, test) dataset tuple
    as returned by the `load_dataset` function.
  transformers
    data transformers, forwarded to model.evaluate.
  metric: string
    choice of evaluation metrics.
  test: boolean, optional (default=False)
    whether to also evaluate on the test set; when False, the returned
    test score is 0.0.

  Returns
  -------
  (train_score, valid_score, test_score, time_for_running)
  """
  started = time.time()
  train_dataset, valid_dataset, test_dataset = all_dataset

  model.fit(train_dataset)
  train_score = model.evaluate(train_dataset, metric, transformers)
  valid_score = model.evaluate(valid_dataset, metric, transformers)
  test_score = model.evaluate(test_dataset, metric, transformers) if test else .0

  time_for_running = time.time() - started
  return train_score, valid_score, test_score, time_for_running
if __name__ == '__main__':
  # Example driver: benchmark the Weave model on the MUV dataset with a
  # random split, writing results.csv into the current directory.
  # (Fix: stripped the " | |" paste artifacts that made this block
  # syntactically invalid.)
  run_benchmark(datasets=['muv'], model='weave',
                split='random', metric=None, featurizer='Weave',
                out_path='.', seed=123)
Here's my `.sh` file:
#!/usr/bin/env bash
# This script sets up a conda environment to run TensorFlow on the GZK nodes.
# Created by Sam Gelman ([email protected]) with help from Jay Wang ([email protected]) and Shengchao Liu ([email protected])
# echo some HTCondor job information
# NOTE(review): $cluster, $process and $runningon are presumably exported by
# the HTCondor submit file -- confirm; otherwise these echo empty strings.
echo _CONDOR_JOB_IWD $_CONDOR_JOB_IWD
echo Cluster $cluster
echo Process $process
echo RunningOn $runningon
# this makes it easier to set up the environments, since the PWD we are running in is not $HOME
export HOME=$PWD
# set up miniconda and add it to path (-b = batch/non-interactive, -p = prefix)
bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3 > /dev/null
export PATH=$PATH:~/miniconda3/bin
# presumably the dataset cache directory read by the deepchem loaders -- verify
export DEEPCHEM_DATA_DIR=$HOME
conda update -n base -c defaults conda --yes -q
git clone https://github.com/deepchem/deepchem.git
# creates and activates the "dctest" conda env and installs the dependencies
bash install_deepchem_conda.sh dctest
cd deepchem
source activate dctest
python setup.py install
cd ..
# run the benchmark using whichever python the activated env resolves to
`which python` runMolNetBenchmark.py
And here's `install_deepchem_conda.sh`, which I pass in using my submit file:
#!/usr/bin/env bash
# Install DeepChem's pinned dependencies into a conda environment.
# Usage: install_deepchem_conda.sh [envname]
#   no argument  -> install into the currently active environment
#   with envname -> create (and activate) a new environment of that name
if [ -z "$1" ];
then
echo "Installing DeepChem in current env"
else
export envname=$1
# NOTE(review): $python_version is never set in this script -- it must be
# exported by the caller, otherwise "python=" is passed with an empty
# version string. Confirm the driver/submit script exports it.
conda create -y --name $envname python=$python_version
source activate $envname
fi
conda install cython -y -q
conda install -y -q -c omnia pdbfixer=1.4
conda install -y -q -c deepchem mdtraj=1.9.1
conda install -y -q -c rdkit rdkit=2017.09.1
# Pinned scientific-python stack from conda-forge.
conda install -y -q -c conda-forge joblib=0.11 \
    six=1.11.0 \
    scikit-learn=0.19.1 \
    networkx=2.1 \
    pillow=5.0.0 \
    pandas=0.22.0 \
    nose=1.3.7 \
    nose-timer=0.7.0 \
    flaky=3.3.0 \
    zlib=1.2.11 \
    requests=2.18.4 \
    xgboost=0.6a2 \
    simdna=0.4.2 \
    pbr=3.1.1 \
    setuptools=39.0.1 \
    biopython=1.71 \
    numpy=1.14
# GPU build of TensorFlow, pinned to 1.13.1.
conda install tensorflow-gpu==1.13.1 --yes -q
Got the following exception:
2019-04-26 22:30:07.231432: I tensorflow/core/common_runtime/gpu/gpu_device.cc:961] DMA: 0
2019-04-26 22:30:07.231438: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0: Y
2019-04-26 22:30:07.231447: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K40m, pci bus id: 0000:04:00.0)
2019-04-26 22:30:12.161834: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 2456 get requests, put_count=2456 evicted_count=1000 eviction_rate=0.407166 and unsatisfied allocation rate=0.447883
2019-04-26 22:30:12.161884: I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 100 to 110
Traceback (most recent call last):
File "model_weave.py", line 111, in <module>
train_scores = model.evaluate(train_dataset, metrics, transformers)
File "/var/lib/condor/execute/slot1/dir_32585/n_gram/deepchem/deepchem/models/models.py", line 192, in evaluate
scores = evaluator.compute_model_performance(metrics)
File "/var/lib/condor/execute/slot1/dir_32585/n_gram/deepchem/deepchem/utils/evaluate.py", line 103, in compute_model_performance
self.output_transformers).astype(int)
AttributeError: 'list' object has no attribute 'astype'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here's the script I used to call the function: