captainjackrana · October 21, 2021 22:02
diff --git a/electrical_conductivity_ml.py b/electrical_conductivity_ml.py
 from matminer.datasets import load_dataset
 from pymatgen.core import Composition
 from matminer.featurizers.structure import DensityFeatures
 from matminer.featurizers.conversions import StrToComposition
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_absolute_error
 from sklearn.dummy import DummyRegressor
 import pprint

 from automatminer import MatPipe
 from automatminer import get_preset_config
 from automatminer.automl.adaptors import TPOTAdaptor

 # Custom config for TPOT to avoid an error.
 # See https://matsci.org/t/error-found-array-with-0-feature-s/4848/10


 def _twentieths(first, last, step=1):
     """Return the fresh list [first/20, ..., last/20], advancing by ``step``/20.

     Spells out the 0.05-spaced hyperparameter grids used below; every call
     builds a new list, so no two config entries share a list object.
     """
     return [k / 20 for k in range(first, last + 1, step)]


 config_dict_1 = {
     # ----- Regressors -----
     'sklearn.ensemble.RandomForestRegressor': {
         'n_estimators': [20, 100, 200, 500, 1000],
         'max_features': _twentieths(1, 19, 2),  # 0.05, 0.15, ..., 0.95
         'min_samples_split': range(2, 21, 3),
         'min_samples_leaf': range(1, 21, 3),
         'bootstrap': [True, False],
     },
     'sklearn.ensemble.GradientBoostingRegressor': {
         'n_estimators': [20, 100, 200, 500, 1000],
         'loss': ['ls', 'lad', 'huber', 'quantile'],
         'learning_rate': [0.01, 0.1, 0.5, 1.0],
         'max_depth': range(1, 11, 2),
         'min_samples_split': range(2, 21, 3),
         'min_samples_leaf': range(1, 21, 3),
         'subsample': _twentieths(1, 20),  # 0.05, 0.10, ..., 1.0
         'max_features': _twentieths(1, 20),  # 0.05, 0.10, ..., 1.0
         'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99],
     },
     'sklearn.ensemble.ExtraTreesRegressor': {
         'n_estimators': [20, 100, 200, 500, 1000],
         'max_features': _twentieths(1, 19, 2),  # 0.05, 0.15, ..., 0.95
         'min_samples_split': range(2, 21, 3),
         'min_samples_leaf': range(1, 21, 3),
         'bootstrap': [True, False],
     },
     'sklearn.tree.DecisionTreeRegressor': {
         'max_depth': range(1, 11, 2),
         'min_samples_split': range(2, 21, 3),
         'min_samples_leaf': range(1, 21, 3),
     },
     'sklearn.neighbors.KNeighborsRegressor': {
         'n_neighbors': range(1, 101),
         'weights': ['uniform', 'distance'],
         'p': [1, 2],
     },
     # J: alpha values taken from Takigawa-2019
     'sklearn.linear_model.Lasso': {'alpha': [1e-2, 1e-1, 1e0, 1e1, 1e2]},
     'sklearn.linear_model.LassoLarsCV': {'normalize': [True, False]},
     'sklearn.linear_model.RidgeCV': {},
     'sklearn.linear_model.ElasticNetCV': {
         'l1_ratio': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
         'tol': [1e-05, 0.0001, 0.001, 0.01, 0.1],
     },

     # ----- Scalers and feature transformers -----
     'sklearn.preprocessing.MaxAbsScaler': {},
     'sklearn.preprocessing.RobustScaler': {},
     'sklearn.preprocessing.StandardScaler': {},
     'sklearn.preprocessing.MinMaxScaler': {},
     'sklearn.preprocessing.Normalizer': {'norm': ['l1', 'l2', 'max']},
     'sklearn.preprocessing.PolynomialFeatures': {
         'degree': [2],
         'include_bias': [False],
         'interaction_only': [False],
     },
     'sklearn.kernel_approximation.RBFSampler': {
         'gamma': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
     },
     'sklearn.kernel_approximation.Nystroem': {
         'kernel': [
             'rbf', 'cosine', 'chi2', 'laplacian', 'polynomial',
             'poly', 'linear', 'additive_chi2', 'sigmoid',
         ],
         'gamma': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
         'n_components': range(1, 11),
     },
     'tpot.builtins.ZeroCount': {},
     'tpot.builtins.OneHotEncoder': {
         'minimum_fraction': _twentieths(1, 5),  # 0.05, 0.10, ..., 0.25
         'sparse': [False],
         'threshold': [10],
     },
     'sklearn.preprocessing.Binarizer': {
         'threshold': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
     },
     'sklearn.cluster.FeatureAgglomeration': {
         'linkage': ['ward', 'complete', 'average'],
         'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'],
     },
     'sklearn.feature_selection.SelectPercentile': {
         'percentile': range(1, 100),
         'score_func': {'sklearn.feature_selection.f_regression': None},
     },
     'sklearn.decomposition.PCA': {
         'svd_solver': ['randomized'],
         'iterated_power': range(1, 11),
     },
     'sklearn.decomposition.FastICA': {
         'tol': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
     },
     'sklearn.feature_selection.VarianceThreshold': {
         'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
     },
 }

 # Start ML
 # Load the dataset that provides the power-factor (pf_p) and s_p columns
 # used below.
 df = load_dataset("boltztrap_mp")

 # The formula strings are converted to compositions inside the pipeline,
 # so no explicit StrToComposition step is needed here; the column only has
 # to be called 'composition'.
 df = df.rename(columns={'formula': 'composition'})
 print(df)

 # Convert power factor to a sigma value: sigma = pf_p / s_p**2.
 # NOTE(review): presumably s_p is the Seebeck coefficient -- confirm units.
 sigma = df['pf_p'] / (df['s_p'] * df['s_p'])
 # A zero s_p yields +/-inf (or NaN for 0/0) and missing inputs yield NaN.
 # Such targets cannot be fitted or scored, so mark them as missing and
 # drop those rows before splitting.
 df['sigma'] = sigma.replace([float('inf'), float('-inf')], float('nan'))
 ml_df = df[['sigma', 'composition']].dropna(subset=['sigma'])
 train_df, test_df = train_test_split(
     ml_df, test_size=0.2, shuffle=True, random_state=20191014
 )

 # Our target property
 target = 'sigma'
 # The frame handed to predict() must not contain the target column.
 prediction_df = test_df.drop(columns=[target])

 # Start from the "express" preset (instead of MatPipe.from_preset("express"))
 # so the learner can be swapped for a TPOT adaptor using the custom search
 # space in config_dict_1.
 config = get_preset_config("express")
 config['learner'] = TPOTAdaptor(
     max_time_mins=1440,  # 24 hours in total
     max_eval_time_mins=20,
     cv=5,
     verbosity=3,
     memory='auto',
     template='Selector-Transformer-Regressor',
     scoring='neg_mean_absolute_error',
     config_dict=config_dict_1,
 )
 pipe = MatPipe(**config)
 # Fit the model
 pipe.fit(train_df, target)

 # Time for prediction
 prediction_df = pipe.predict(prediction_df)
 # Check the predictions. A bare .head() only displays in a notebook; in a
 # script its result is discarded, so print it explicitly.
 print(prediction_df.head())

 # Save pipeline for future
 pipe.save("sigma_conductivity_pipeline.p")

 # Start model accuracy evaluation: a DummyRegressor serves as the baseline
 # the fitted pipeline is compared against.
 dr = DummyRegressor()
 dr.fit(train_df["composition"], train_df[target])
 dummy_test = dr.predict(test_df["composition"])

 # Score dummy and MatPipe
 true = test_df[target]
 matpipe_test = prediction_df[target + " predicted"]

 mae_matpipe = mean_absolute_error(true, matpipe_test)
 mae_dummy = mean_absolute_error(true, dummy_test)

 print(f"Dummy MAE: {mae_dummy}")
 print(f"MatPipe MAE: {mae_matpipe}")

 # Write the pipeline summary to disk and echo it to stdout.
 summary = pipe.summarize(filename="MatPipe_comp_summary.json")

 pprint.pprint(summary)
	from matminer.datasets import load_dataset
	from pymatgen.core import Composition
	from matminer.featurizers.structure import DensityFeatures
	from matminer.featurizers.conversions import StrToComposition
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import mean_absolute_error
	from sklearn.dummy import DummyRegressor
	import pprint

	from automatminer import MatPipe
	from automatminer import get_preset_config
	from automatminer.automl.adaptors import TPOTAdaptor

	# Custom config for TPOT to avoid an error.
	# See https://matsci.org/t/error-found-array-with-0-feature-s/4848/10


	def _twentieths(first, last, step=1):
	    """Return the fresh list [first/20, ..., last/20], advancing by ``step``/20.

	    Spells out the 0.05-spaced hyperparameter grids used below; every call
	    builds a new list, so no two config entries share a list object.
	    """
	    return [k / 20 for k in range(first, last + 1, step)]


	config_dict_1 = {
	    # ----- Regressors -----
	    'sklearn.ensemble.RandomForestRegressor': {
	        'n_estimators': [20, 100, 200, 500, 1000],
	        'max_features': _twentieths(1, 19, 2),  # 0.05, 0.15, ..., 0.95
	        'min_samples_split': range(2, 21, 3),
	        'min_samples_leaf': range(1, 21, 3),
	        'bootstrap': [True, False],
	    },
	    'sklearn.ensemble.GradientBoostingRegressor': {
	        'n_estimators': [20, 100, 200, 500, 1000],
	        'loss': ['ls', 'lad', 'huber', 'quantile'],
	        'learning_rate': [0.01, 0.1, 0.5, 1.0],
	        'max_depth': range(1, 11, 2),
	        'min_samples_split': range(2, 21, 3),
	        'min_samples_leaf': range(1, 21, 3),
	        'subsample': _twentieths(1, 20),  # 0.05, 0.10, ..., 1.0
	        'max_features': _twentieths(1, 20),  # 0.05, 0.10, ..., 1.0
	        'alpha': [0.75, 0.8, 0.85, 0.9, 0.95, 0.99],
	    },
	    'sklearn.ensemble.ExtraTreesRegressor': {
	        'n_estimators': [20, 100, 200, 500, 1000],
	        'max_features': _twentieths(1, 19, 2),  # 0.05, 0.15, ..., 0.95
	        'min_samples_split': range(2, 21, 3),
	        'min_samples_leaf': range(1, 21, 3),
	        'bootstrap': [True, False],
	    },
	    'sklearn.tree.DecisionTreeRegressor': {
	        'max_depth': range(1, 11, 2),
	        'min_samples_split': range(2, 21, 3),
	        'min_samples_leaf': range(1, 21, 3),
	    },
	    'sklearn.neighbors.KNeighborsRegressor': {
	        'n_neighbors': range(1, 101),
	        'weights': ['uniform', 'distance'],
	        'p': [1, 2],
	    },
	    # J: alpha values taken from Takigawa-2019
	    'sklearn.linear_model.Lasso': {'alpha': [1e-2, 1e-1, 1e0, 1e1, 1e2]},
	    'sklearn.linear_model.LassoLarsCV': {'normalize': [True, False]},
	    'sklearn.linear_model.RidgeCV': {},
	    'sklearn.linear_model.ElasticNetCV': {
	        'l1_ratio': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
	        'tol': [1e-05, 0.0001, 0.001, 0.01, 0.1],
	    },

	    # ----- Scalers and feature transformers -----
	    'sklearn.preprocessing.MaxAbsScaler': {},
	    'sklearn.preprocessing.RobustScaler': {},
	    'sklearn.preprocessing.StandardScaler': {},
	    'sklearn.preprocessing.MinMaxScaler': {},
	    'sklearn.preprocessing.Normalizer': {'norm': ['l1', 'l2', 'max']},
	    'sklearn.preprocessing.PolynomialFeatures': {
	        'degree': [2],
	        'include_bias': [False],
	        'interaction_only': [False],
	    },
	    'sklearn.kernel_approximation.RBFSampler': {
	        'gamma': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
	    },
	    'sklearn.kernel_approximation.Nystroem': {
	        'kernel': [
	            'rbf', 'cosine', 'chi2', 'laplacian', 'polynomial',
	            'poly', 'linear', 'additive_chi2', 'sigmoid',
	        ],
	        'gamma': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
	        'n_components': range(1, 11),
	    },
	    'tpot.builtins.ZeroCount': {},
	    'tpot.builtins.OneHotEncoder': {
	        'minimum_fraction': _twentieths(1, 5),  # 0.05, 0.10, ..., 0.25
	        'sparse': [False],
	        'threshold': [10],
	    },
	    'sklearn.preprocessing.Binarizer': {
	        'threshold': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
	    },
	    'sklearn.cluster.FeatureAgglomeration': {
	        'linkage': ['ward', 'complete', 'average'],
	        'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'],
	    },
	    'sklearn.feature_selection.SelectPercentile': {
	        'percentile': range(1, 100),
	        'score_func': {'sklearn.feature_selection.f_regression': None},
	    },
	    'sklearn.decomposition.PCA': {
	        'svd_solver': ['randomized'],
	        'iterated_power': range(1, 11),
	    },
	    'sklearn.decomposition.FastICA': {
	        'tol': _twentieths(0, 20),  # 0.0, 0.05, ..., 1.0
	    },
	    'sklearn.feature_selection.VarianceThreshold': {
	        'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
	    },
	}

	# Start ML
	# Load the dataset that provides the power-factor (pf_p) and s_p columns
	# used below.
	df = load_dataset("boltztrap_mp")

	# The formula strings are converted to compositions inside the pipeline,
	# so no explicit StrToComposition step is needed here; the column only has
	# to be called 'composition'.
	df = df.rename(columns={'formula': 'composition'})
	print(df)

	# Convert power factor to a sigma value: sigma = pf_p / s_p**2.
	# NOTE(review): presumably s_p is the Seebeck coefficient -- confirm units.
	sigma = df['pf_p'] / (df['s_p'] * df['s_p'])
	# A zero s_p yields +/-inf (or NaN for 0/0) and missing inputs yield NaN.
	# Such targets cannot be fitted or scored, so mark them as missing and
	# drop those rows before splitting.
	df['sigma'] = sigma.replace([float('inf'), float('-inf')], float('nan'))
	ml_df = df[['sigma', 'composition']].dropna(subset=['sigma'])
	train_df, test_df = train_test_split(
	    ml_df, test_size=0.2, shuffle=True, random_state=20191014
	)

	# Our target property
	target = 'sigma'
	# The frame handed to predict() must not contain the target column.
	prediction_df = test_df.drop(columns=[target])

	# Start from the "express" preset (instead of MatPipe.from_preset("express"))
	# so the learner can be swapped for a TPOT adaptor using the custom search
	# space in config_dict_1.
	config = get_preset_config("express")
	config['learner'] = TPOTAdaptor(
	    max_time_mins=1440,  # 24 hours in total
	    max_eval_time_mins=20,
	    cv=5,
	    verbosity=3,
	    memory='auto',
	    template='Selector-Transformer-Regressor',
	    scoring='neg_mean_absolute_error',
	    config_dict=config_dict_1,
	)
	pipe = MatPipe(**config)
	# Fit the model
	pipe.fit(train_df, target)

	# Time for prediction
	prediction_df = pipe.predict(prediction_df)
	# Check the predictions. A bare .head() only displays in a notebook; in a
	# script its result is discarded, so print it explicitly.
	print(prediction_df.head())

	# Save pipeline for future
	pipe.save("sigma_conductivity_pipeline.p")

	# Start model accuracy evaluation: a DummyRegressor serves as the baseline
	# the fitted pipeline is compared against.
	dr = DummyRegressor()
	dr.fit(train_df["composition"], train_df[target])
	dummy_test = dr.predict(test_df["composition"])

	# Score dummy and MatPipe
	true = test_df[target]
	matpipe_test = prediction_df[target + " predicted"]

	mae_matpipe = mean_absolute_error(true, matpipe_test)
	mae_dummy = mean_absolute_error(true, dummy_test)

	print(f"Dummy MAE: {mae_dummy}")
	print(f"MatPipe MAE: {mae_matpipe}")

	# Write the pipeline summary to disk and echo it to stdout.
	summary = pipe.summarize(filename="MatPipe_comp_summary.json")

	pprint.pprint(summary)