The conf file for a run of "mastml from the future" where all the important features have been added
# You run this with `$ python mastml.py settings.conf data.csv -o results/`
# Second example: `$ python mastml.py input.conf compositions.csv -o Desktop/model-results/`
# Or you open the website, upload a csv, upload a conf file, and download the resulting zip file
# Sections and subsections are in CamelCase; parameters are in snake_case
[GeneralSetup]
input_features = Auto # Defaults to all but the last column (specifying Auto is the same as omitting this option)
#input_features = square_footage, crime_rate, year_built # you can specify which columns from the csv you'd like to keep
target_feature = Auto # Defaults to the last column
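# Or name the target column explicitly, e.g. (sale_price is a hypothetical column name):
#target_feature = sale_price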
[FeatureNormalization]
# You can specify which features to normalize with the keywords `input` and `target`, or you can explicitly list them:
#include = Auto # Normalize all features
#include = input # Normalize just the input features
#include = feat1, feat4 # Just normalize feature1 and feature4 from the csv
#TODO: add all sklearn feature normalization routines, plus our own
normalization_type = standardize # Either standardize or normalize (see the sketch below)
mean = 0
stddev = 1
# Or
# min = 0
# max = 1
# I'm not sure of the best way to organize this
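# As a sketch (assuming the usual scaling semantics), standardize would map each column x to
#   x' = (x - mean(x)) / stddev(x) * stddev + mean
# while normalize would rescale each column into the requested range:
#   x' = (x - min(x)) / (max(x) - min(x)) * (max - min) + min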
[FeatureGeneration] # If you don't want to use a particular API, then omit or comment out that subsection.
# If you don't want to do any feature generation, then omit this entire section
#TODO: add all sklearn feature generation routines, plus our own
[[magpie]]
api_key = 1234
[[materials_project]]
api_key = 1234
on_missing = error # Optional arg, defaults to 'ignore'
[[citrine]]
api_key = 1234
[[custom]]
area = length * width # create new columns in the dataframe using algebra on existing columns
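# Another illustrative example of a derived column (hypothetical column names, same algebra syntax):
#volume = length * width * height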
[FeatureSelection]
#regression scoring functions: f_regression, mutual_info_regression (see the regression example after this list)
#classification scoring functions: chi2, f_classif, mutual_info_classif
#TODO: add all sklearn feature selection routines, plus our own
[[RemoveConstantFeatures]]
[[PrincipalComponentAnalysis]]
[[SelectKBest]]
k = 3
scoring = f_classif
[[VarianceThreshold]]
threshold = 0.2
[[SelectPercentile]]
percentile = 10 # percentage of features to keep
scoring = chi2
[[SelectFpr]] # Select features based on a false positive rate test.
[[SelectFdr]] # Select features based on an estimated false discovery rate.
[[SelectFwe]] # Select features based on family-wise error rate.
[[GenericUnivariateSelect]] # Univariate feature selector with configurable mode.
[[RFE]] # recursive feature elimination
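# For a regression problem you would swap in one of the regression scoring functions listed above,
# e.g. (commented out; values are illustrative, not recommendations):
#[[SelectKBest]]
#k = 5
#scoring = f_regression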
[DataSplits] # mastml will train & test the model on each of the following data splits.
# Omit a split to not use it.
# Or list a split without args to use it with default args.
# Almost every option has a default argument.
#TODO: add all sklearn data split routines, plus our own
# Classification metrics: accuracy, average_precision, f1, f1_micro, f1_macro, f1_weighted, f1_samples, neg_log_loss, precision, recall, roc_auc
# Regression metrics: explained_variance, neg_mean_absolute_error, neg_mean_squared_error, neg_mean_squared_log_error, neg_median_absolute_error, r2
# (see the example at the end of this section)
[[NoSplit]] # Just train the model on the training data and test it on that same data
scoring_metric = accuracy # could also be r2 or another metric; determines the definition of 'best' for plotting
plots = best, worst, best_vs_avg, avg # Plots to save from this run
#plots = Auto # just use the default plots (alternative)
stats = accuracy, precision, recall # The stats to collect on each run. These will be saved with the plots
#stats = Auto # just use the default stats (alternative)
[[Randomize]] # Randomly remap x and y values (just to see how the model does with nonsense input)
[[KFoldRandomize]] # First randomize all the x-y matchings, then see how well the nonsense-model does on cross validation
k = 5
[[WithoutEachCluster]] # First, cluster the data, then do a run without each cluster
cluster_features = input # use all input features. Or:
#cluster_features = age, gender, height # just use these features to make clusters
algorithm = kmeans
num_clusters = auto # use the default number of clusters for the algorithm
#num_clusters = 5 # directly specify the number of clusters desired (alternative)
[[JustEachGroup]] # Train the model on one group at a time and test it on the rest of the data
grouping_feature = class # The name of the column in the csv which contains the classes
plots = best, worst
[[WithoutEachGroup]] # Train the model on n-1 of the n groups and test on the excluded group
grouping_feature = class
plots = avg_vs_worst
[[WithoutElement]] # Train the model without each element, then test on the rows with that element
composition_column = compositions # the name of the column in the csv containing the compositions
element = C # For carbon, for example
[[KFold]] # Split the data into k even chunks. Try training/testing on each chunk.
k = 5
[[LeaveOutPercent]] # Like KFold, but specify the percentage to leave out
percentage = 20
[[LeaveOneOut]] # Try training the model on all the data except one row, for every row
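# A fuller example (illustrative; assumes scoring_metric and plots are accepted by any split,
# as shown under [[NoSplit]] above): a 5-fold regression CV scored on mean squared error
#[[KFold]]
#k = 5
#scoring_metric = neg_mean_squared_error
#plots = best, worst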
[Models] # List the sklearn models you want to use and the parameters you want to change.
# Your models should be either all classifiers or all regressors. No mixing allowed.
# All parameters have default settings. If you omit a parameter, then the default setting is used.
# Go to http://scikit-learn.org/stable/documentation.html and search for details on a particular model.
# A conf file may enable any number of models, including none.
# mastml will run each enabled split with each enabled model; see the example below.
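# For example, a regression run might enable (parameter values here are illustrative, not recommendations):
#[[KernelRidge]]
#alpha = 0.01
#kernel = rbf
#[[RandomForestRegressor]]
#n_estimators = 200
#max_depth = 10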
# CLASSIFIERS:
#[[AdaBoostClassifier]]
# algorithm
# base_estimator
# learning_rate
# n_estimators
# random_state
#[[BaggingClassifier]]
# base_estimator
# bootstrap
# bootstrap_features
# max_features
# max_samples
# n_estimators
# n_jobs
# oob_score
# random_state
# verbose
# warm_start
#[[BernoulliNB]]
# alpha
# binarize
# class_prior
# fit_prior
#[[CalibratedClassifierCV]]
# base_estimator
# cv
# method
#[[DecisionTreeClassifier]]
# class_weight
# criterion
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# presort
# random_state
# splitter
#[[ExtraTreeClassifier]]
# class_weight
# criterion
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# random_state
# splitter
#[[ExtraTreesClassifier]]
# bootstrap
# class_weight
# criterion
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# n_estimators
# n_jobs
# oob_score
# random_state
# verbose
# warm_start
#[[GaussianNB]]
# priors
#[[GaussianProcessClassifier]]
# copy_X_train
# kernel
# max_iter_predict
# multi_class
# n_jobs
# n_restarts_optimizer
# optimizer
# random_state
# warm_start
#[[GradientBoostingClassifier]]
# criterion
# init
# learning_rate
# loss
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# n_estimators
# presort
# random_state
# subsample
# verbose
# warm_start
#[[KNeighborsClassifier]]
# algorithm
# leaf_size
# metric
# metric_params
# n_jobs
# n_neighbors
# p
# weights
#[[LabelPropagation]]
# alpha
# gamma
# kernel
# max_iter
# n_jobs
# n_neighbors
# tol
#[[LabelSpreading]]
# alpha
# gamma
# kernel
# max_iter
# n_jobs
# n_neighbors
# tol
#[[LinearDiscriminantAnalysis]]
# n_components
# priors
# shrinkage
# solver
# store_covariance
# tol
#[[LinearSVC]]
# C
# class_weight
# dual
# fit_intercept
# intercept_scaling
# loss
# max_iter
# multi_class
# penalty
# random_state
# tol
# verbose
#[[LogisticRegression]]
# C
# class_weight
# dual
# fit_intercept
# intercept_scaling
# max_iter
# multi_class
# n_jobs
# penalty
# random_state
# solver
# tol
# verbose
# warm_start
#[[LogisticRegressionCV]]
# Cs
# class_weight
# cv
# dual
# fit_intercept
# intercept_scaling
# max_iter
# multi_class
# n_jobs
# penalty
# random_state
# refit
# scoring
# solver
# tol
# verbose
#[[MLPClassifier]]
# activation
# alpha
# batch_size
# beta_1
# beta_2
# early_stopping
# epsilon
# hidden_layer_sizes
# learning_rate
# learning_rate_init
# max_iter
# momentum
# nesterovs_momentum
# power_t
# random_state
# shuffle
# solver
# tol
# validation_fraction
# verbose
# warm_start
#[[MultinomialNB]]
# alpha
# class_prior
# fit_prior
#[[NearestCentroid]]
# metric
# shrink_threshold
#[[NuSVC]]
# cache_size
# class_weight
# coef0
# decision_function_shape
# degree
# gamma
# kernel
# max_iter
# nu
# probability
# random_state
# shrinking
# tol
# verbose
#[[PassiveAggressiveClassifier]]
# C
# average
# class_weight
# fit_intercept
# loss
# max_iter
# n_iter
# n_jobs
# random_state
# shuffle
# tol
# verbose
# warm_start
#[[Perceptron]]
# alpha
# class_weight
# eta0
# fit_intercept
# max_iter
# n_iter
# n_jobs
# penalty
# random_state
# shuffle
# tol
# verbose
# warm_start
#[[QuadraticDiscriminantAnalysis]]
# priors
# reg_param
# store_covariance
# store_covariances
# tol
#[[RadiusNeighborsClassifier]]
# algorithm
# leaf_size
# metric
# metric_params
# outlier_label
# p
# radius
# weights
#[[RandomForestClassifier]]
# bootstrap
# class_weight
# criterion
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# n_estimators
# n_jobs
# oob_score
# random_state
# verbose
# warm_start
#[[RidgeClassifier]]
# alpha
# class_weight
# copy_X
# fit_intercept
# max_iter
# normalize
# random_state
# solver
# tol
#[[RidgeClassifierCV]]
# alphas
# class_weight
# cv
# fit_intercept
# normalize
# scoring
#[[SGDClassifier]]
# alpha
# average
# class_weight
# epsilon
# eta0
# fit_intercept
# l1_ratio
# learning_rate
# loss
# max_iter
# n_iter
# n_jobs
# penalty
# power_t
# random_state
# shuffle
# tol
# verbose
# warm_start
#[[SVC]]
# C
# cache_size
# class_weight
# coef0
# decision_function_shape
# degree
# gamma
# kernel
# max_iter
# probability
# random_state
# shrinking
# tol
# verbose
# REGRESSORS:
#[[ARDRegression]]
# alpha_1
# alpha_2
# compute_score
# copy_X
# fit_intercept
# lambda_1
# lambda_2
# n_iter
# normalize
# threshold_lambda
# tol
# verbose
#[[AdaBoostRegressor]]
# base_estimator
# learning_rate
# loss
# n_estimators
# random_state
#[[BaggingRegressor]]
# base_estimator
# bootstrap
# bootstrap_features
# max_features
# max_samples
# n_estimators
# n_jobs
# oob_score
# random_state
# verbose
# warm_start
#[[BayesianRidge]]
# alpha_1
# alpha_2
# compute_score
# copy_X
# fit_intercept
# lambda_1
# lambda_2
# n_iter
# normalize
# tol
# verbose
#[[CCA]]
# copy
# max_iter
# n_components
# scale
# tol
#[[DecisionTreeRegressor]]
# criterion
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# presort
# random_state
# splitter
#[[ElasticNet]]
# alpha
# copy_X
# fit_intercept
# l1_ratio
# max_iter
# normalize
# positive
# precompute
# random_state
# selection
# tol
# warm_start
#[[ElasticNetCV]]
# alphas
# copy_X
# cv
# eps
# fit_intercept
# l1_ratio
# max_iter
# n_alphas
# n_jobs
# normalize
# positive
# precompute
# random_state
# selection
# tol
# verbose
#[[ExtraTreeRegressor]]
# criterion
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# random_state
# splitter
#[[ExtraTreesRegressor]]
# bootstrap
# criterion
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# n_estimators
# n_jobs
# oob_score
# random_state
# verbose
# warm_start
#[[GaussianProcess]]
# beta0
# corr
# normalize
# nugget
# optimizer
# random_start
# random_state
# regr
# storage_mode
# theta0
# thetaL
# thetaU
# verbose
#[[GaussianProcessRegressor]]
# alpha
# copy_X_train
# kernel
# n_restarts_optimizer
# normalize_y
# optimizer
# random_state
#[[GradientBoostingRegressor]]
# alpha
# criterion
# init
# learning_rate
# loss
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# n_estimators
# presort
# random_state
# subsample
# verbose
# warm_start
#[[HuberRegressor]]
# alpha
# epsilon
# fit_intercept
# max_iter
# tol
# warm_start
#[[KNeighborsRegressor]]
# algorithm
# leaf_size
# metric
# metric_params
# n_jobs
# n_neighbors
# p
# weights
#[[KernelRidge]]
# alpha
# coef0
# degree
# gamma
# kernel
# kernel_params
#[[Lars]]
# copy_X
# eps
# fit_intercept
# fit_path
# n_nonzero_coefs
# normalize
# positive
# precompute
# verbose
#[[LarsCV]]
# copy_X
# cv
# eps
# fit_intercept
# max_iter
# max_n_alphas
# n_jobs
# normalize
# positive
# precompute
# verbose
#[[Lasso]]
# alpha
# copy_X
# fit_intercept
# max_iter
# normalize
# positive
# precompute
# random_state
# selection
# tol
# warm_start
#[[LassoCV]]
# alphas
# copy_X
# cv
# eps
# fit_intercept
# max_iter
# n_alphas
# n_jobs
# normalize
# positive
# precompute
# random_state
# selection
# tol
# verbose
#[[LassoLars]]
# alpha
# copy_X
# eps
# fit_intercept
# fit_path
# max_iter
# normalize
# positive
# precompute
# verbose
#[[LassoLarsCV]]
# copy_X
# cv
# eps
# fit_intercept
# max_iter
# max_n_alphas
# n_jobs
# normalize
# positive
# precompute
# verbose
#[[LassoLarsIC]]
# copy_X
# criterion
# eps
# fit_intercept
# max_iter
# normalize
# positive
# precompute
# verbose
#[[LinearRegression]]
# copy_X
# fit_intercept
# n_jobs
# normalize
#[[LinearSVR]]
# C
# dual
# epsilon
# fit_intercept
# intercept_scaling
# loss
# max_iter
# random_state
# tol
# verbose
#[[MLPRegressor]]
# activation
# alpha
# batch_size
# beta_1
# beta_2
# early_stopping
# epsilon
# hidden_layer_sizes
# learning_rate
# learning_rate_init
# max_iter
# momentum
# nesterovs_momentum
# power_t
# random_state
# shuffle
# solver
# tol
# validation_fraction
# verbose
# warm_start
#[[MultiTaskElasticNet]]
# alpha
# copy_X
# fit_intercept
# l1_ratio
# max_iter
# normalize
# random_state
# selection
# tol
# warm_start
#[[MultiTaskElasticNetCV]]
# alphas
# copy_X
# cv
# eps
# fit_intercept
# l1_ratio
# max_iter
# n_alphas
# n_jobs
# normalize
# random_state
# selection
# tol
# verbose
#[[MultiTaskLasso]]
# alpha
# copy_X
# fit_intercept
# max_iter
# normalize
# random_state
# selection
# tol
# warm_start
#[[MultiTaskLassoCV]]
# alphas
# copy_X
# cv
# eps
# fit_intercept
# max_iter
# n_alphas
# n_jobs
# normalize
# random_state
# selection
# tol
# verbose
#[[NuSVR]]
# C
# cache_size
# coef0
# degree
# gamma
# kernel
# max_iter
# nu
# shrinking
# tol
# verbose
#[[OrthogonalMatchingPursuit]]
# fit_intercept
# n_nonzero_coefs
# normalize
# precompute
# tol
#[[OrthogonalMatchingPursuitCV]]
# copy
# cv
# fit_intercept
# max_iter
# n_jobs
# normalize
# verbose
#[[PLSCanonical]]
# algorithm
# copy
# max_iter
# n_components
# scale
# tol
#[[PLSRegression]]
# copy
# max_iter
# n_components
# scale
# tol
#[[PassiveAggressiveRegressor]]
# C
# average
# epsilon
# fit_intercept
# loss
# max_iter
# n_iter
# random_state
# shuffle
# tol
# verbose
# warm_start
#[[RANSACRegressor]]
# base_estimator
# is_data_valid
# is_model_valid
# loss
# max_skips
# max_trials
# min_samples
# random_state
# residual_metric
# residual_threshold
# stop_n_inliers
# stop_probability
# stop_score
#[[RadiusNeighborsRegressor]]
# algorithm
# leaf_size
# metric
# metric_params
# p
# radius
# weights
#[[RandomForestRegressor]]
# bootstrap
# criterion
# max_depth
# max_features
# max_leaf_nodes
# min_impurity_decrease
# min_impurity_split
# min_samples_leaf
# min_samples_split
# min_weight_fraction_leaf
# n_estimators
# n_jobs
# oob_score
# random_state
# verbose
# warm_start
#[[Ridge]]
# alpha
# copy_X
# fit_intercept
# max_iter
# normalize
# random_state
# solver
# tol
#[[RidgeCV]]
# alphas
# cv
# fit_intercept
# gcv_mode
# normalize
# scoring
# store_cv_values
#[[SGDRegressor]]
# alpha
# average
# epsilon
# eta0
# fit_intercept
# l1_ratio
# learning_rate
# loss
# max_iter
# n_iter
# penalty
# power_t
# random_state
# shuffle
# tol
# verbose
# warm_start
#[[SVR]]
# C
# cache_size
# coef0
# degree
# epsilon
# gamma
# kernel
# max_iter
# shrinking
# tol
# verbose
#[[TheilSenRegressor]]
# copy_X
# fit_intercept
# max_iter
# max_subpopulation
# n_jobs
# n_subsamples
# random_state
# tol
# verbose
Replacing the idea of tests with DataSplits is a good idea. Also grouping the [FeatureSelection] bits helps a great deal.