"""Routine for predicting the electrical conductivity of materials using the matminer and automatminer libraries."""
from matminer.datasets import load_dataset
from pymatgen.core import Composition
from matminer.featurizers.structure import DensityFeatures
from matminer.featurizers.conversions import StrToComposition
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor
import pprint
from automatminer import MatPipe
from automatminer import get_preset_config
from automatminer.automl.adaptors import TPOTAdaptor
# Custom TPOT config_dict to avoid the "Found array with 0 feature(s)" error; see
# https://matsci.org/t/error-found-array-with-0-feature-s/4848/10
# (a comparison against TPOT's stock regressor config follows the dict below).
config_dict_1 = {
    'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': [20, 100, 200, 500, 1000],
        'max_features': [0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95],
        'min_samples_split': range(2, 21, 3),
        'min_samples_leaf': range(1, 21, 3),
        'bootstrap': [True, False]},
    'sklearn.ensemble.GradientBoostingRegressor': {
        'n_estimators': [20, 100, 200, 500, 1000],
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'learning_rate': [0.01, 0.1, 0.5, 1.0],
        'max_depth': range(1, 11, 2),
        'min_samples_split': range(2, 21, 3),
        'min_samples_leaf': range(1, 21, 3),
        'subsample': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                      0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.],
        'max_features': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                         0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.],
        'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]},
    'sklearn.ensemble.ExtraTreesRegressor': {
        'n_estimators': [20, 100, 200, 500, 1000],
        'max_features': [0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95],
        'min_samples_split': range(2, 21, 3),
        'min_samples_leaf': range(1, 21, 3),
        'bootstrap': [True, False]},
    'sklearn.tree.DecisionTreeRegressor': {
        'max_depth': range(1, 11, 2),
        'min_samples_split': range(2, 21, 3),
        'min_samples_leaf': range(1, 21, 3)},
    'sklearn.neighbors.KNeighborsRegressor': {
        'n_neighbors': range(1, 101),
        'weights': ['uniform', 'distance'],
        'p': [1, 2]},
    'sklearn.linear_model.Lasso': {'alpha': [1e-2, 1e-1, 1e0, 1e1, 1e2]},  # alpha values taken from Takigawa-2019
    'sklearn.linear_model.LassoLarsCV': {'normalize': [True, False]},
    'sklearn.linear_model.RidgeCV': {},
    'sklearn.linear_model.ElasticNetCV': {
        'l1_ratio': [0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                     0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.],
        'tol': [1e-05, 0.0001, 0.001, 0.01, 0.1]},
    'sklearn.preprocessing.MaxAbsScaler': {},
    'sklearn.preprocessing.RobustScaler': {},
    'sklearn.preprocessing.StandardScaler': {},
    'sklearn.preprocessing.MinMaxScaler': {},
    'sklearn.preprocessing.Normalizer': {'norm': ['l1', 'l2', 'max']},
    'sklearn.preprocessing.PolynomialFeatures': {
        'degree': [2],
        'include_bias': [False],
        'interaction_only': [False]},
    'sklearn.kernel_approximation.RBFSampler': {
        'gamma': [0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                  0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.]},
    'sklearn.kernel_approximation.Nystroem': {
        'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly',
                   'linear', 'additive_chi2', 'sigmoid'],
        'gamma': [0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                  0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.],
        'n_components': range(1, 11)},
    'tpot.builtins.ZeroCount': {},
    'tpot.builtins.OneHotEncoder': {
        'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25],
        'sparse': [False],
        'threshold': [10]},
    'sklearn.preprocessing.Binarizer': {
        'threshold': [0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                      0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.]},
    'sklearn.cluster.FeatureAgglomeration': {
        'linkage': ['ward', 'complete', 'average'],
        'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']},
    'sklearn.feature_selection.SelectPercentile': {
        'percentile': range(1, 100),
        'score_func': {'sklearn.feature_selection.f_regression': None}},
    'sklearn.decomposition.PCA': {
        'svd_solver': ['randomized'],
        'iterated_power': range(1, 11)},
    'sklearn.decomposition.FastICA': {
        'tol': [0., 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.]},
    'sklearn.feature_selection.VarianceThreshold': {
        'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]},
}
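# For context: this search space appears to be a trimmed-down variant of TPOT's stock
# regressor configuration, with components that can reduce the feature matrix to zero
# columns left out. An illustrative (not authoritative) way to see what was dropped:
# from tpot.config import regressor_config_dict
# print(sorted(set(regressor_config_dict) - set(config_dict_1)))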
# Start ML
# Load the BoltzTraP MP dataset (Seebeck coefficients, power factors and effective
# masses computed for Materials Project compounds)
df = load_dataset("boltztrap_mp")
# Formula strings are converted to Composition objects automatically by the
# automatminer pipeline, so an explicit StrToComposition step is not needed:
# df = StrToComposition(target_col_id='composition').featurize_dataframe(df, 'formula')
df = df.rename(columns={'formula': 'composition'})
print(df)
# Convert the p-type power factor to an electrical conductivity (sigma) value
df['sigma'] = df['pf_p']/(df['s_p']*df['s_p'])
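# The thermoelectric power factor is defined as PF = S^2 * sigma, so dividing pf_p by
# the squared Seebeck coefficient s_p recovers sigma (in whatever constant-relaxation-time
# convention the BoltzTraP data uses). Any rows with s_p == 0 would yield infinities here
# and could optionally be filtered first, e.g. df = df[df['s_p'] != 0].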
ml_df = df[['sigma', 'composition']]
train_df, test_df = train_test_split(ml_df, test_size=0.2, shuffle=True, random_state=20191014)
# Our target property
target = 'sigma'
prediction_df = test_df.drop(columns=[target])
# pipe = MatPipe.from_preset("express")
config = get_preset_config("express")
# Swap the preset learner for a TPOT adaptor that searches the custom space above
config['learner'] = TPOTAdaptor(max_time_mins=1440,      # total search budget (24 h)
                                max_eval_time_mins=20,   # time limit per candidate pipeline
                                cv=5,
                                verbosity=3,
                                memory='auto',
                                template='Selector-Transformer-Regressor',
                                scoring='neg_mean_absolute_error',
                                config_dict=config_dict_1)
pipe = MatPipe(**config)
# Fit the model
pipe.fit(train_df, target)
# Time for prediction
prediction_df = pipe.predict(prediction_df)
# Check the predictions
print(prediction_df.head())
# Save pipeline for future
pipe.save("sigma_conductivity_pipeline.p")
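# A minimal sketch of reusing the saved pipeline later (assumes the pickled file above
# and a dataframe with a 'composition' column of formula strings, as in this script):
# saved_pipe = MatPipe.load("sigma_conductivity_pipeline.p")
# new_predictions = saved_pipe.predict(prediction_df)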
# Start model accuracy evaluation
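# DummyRegressor (default strategy='mean') ignores its inputs and always predicts the
# mean of the training targets, providing a naive baseline MAE to beat.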
dr = DummyRegressor()
dr.fit(train_df["composition"], train_df[target])
dummy_test = dr.predict(test_df["composition"])
# Score dummy and MatPipe
true = test_df[target]
matpipe_test = prediction_df[target + " predicted"]
mae_matpipe = mean_absolute_error(true, matpipe_test)
mae_dummy = mean_absolute_error(true, dummy_test)
print("Dummy MAE: {}".format(mae_dummy))
print("MatPipe MAE: {}".format(mae_matpipe))
summary = pipe.summarize(filename="MatPipe_comp_summary.json")
pprint.pprint(summary)