Created
September 23, 2016 17:51
-
-
Save jpotts18/f004a865b0391df17955a528a15abf6c to your computer and use it in GitHub Desktop.
ML Database
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// survival Survival (0 = No; 1 = Yes) | |
// pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd) | |
// name Name | |
// sex Sex | |
// age Age | |
// sibsp Number of Siblings/Spouses Aboard | |
// parch Number of Parents/Children Aboard | |
// ticket Ticket Number | |
// fare Passenger Fare | |
// cabin Cabin | |
// embarked Port of Embarkation | |
// Binary | |
// Multiclass | |
// Regression | |
// What type of model do you want to create? (classification or regression)
classification {
  // Field that holds the labeled answers the models learn to predict.
  target: 'survival',
  // Candidate models to try; the names match scikit-learn estimator classes.
  models: [
    'KNeighborsClassifier',
    'SVC',
    'DecisionTreeClassifier',
    'RandomForestClassifier',
    'AdaBoostClassifier',
    'GaussianNB'
  ],
  // Hyper-parameter search settings.
  grid_search: {
    // 'rmse' is a regression error and does not fit a classification target;
    // score candidates on misclassification rate instead.
    loss: 'zero_one_loss'
  }
}
// Ensemble variant: combine several classifiers through a voting scheme.
classification {
  target: 'survival',
  models: [
    'LogisticRegression',
    'RandomForestClassifier',
    'GaussianNB'
  ],
  // Ensemble strategies under consideration: bagging, boosting, voting.
  voting: {
    // 'hard' = majority vote
    style: 'hard',
    // presumably one weight per model, in `models` order — confirm
    weights: [2, 1, 2]
  },
  // Same key spelling as the grid_search section of the first block.
  grid_search: {
    cross_validations: 5
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// titanic data set | |
// survival Survival (0 = No; 1 = Yes) | |
// pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd) | |
// name Name | |
// sex Sex | |
// age Age | |
// sibsp Number of Siblings/Spouses Aboard | |
// parch Number of Parents/Children Aboard | |
// ticket Ticket Number | |
// fare Passenger Fare | |
// cabin Cabin | |
// embarked Port of Embarkation | |
// Impute: | |
// Imputation. For various reasons, many real world datasets contain missing values, | |
// often encoded as blanks, NaNs or other placeholders. Such datasets however are | |
// incompatible with scikit-learn estimators which assume that all values in an array | |
// are numerical, and that all have and hold meaning. | |
// Note: imputation must be done before scaling and the other preprocessing steps below.
impute {
  // Fields whose null values are replaced with the mean.
  mean: ['age', 'fare'],
  // Null values replaced with a fixed default; presumably [field, default value].
  // NOTE(review): 'NYC' looks like a placeholder — confirm it is a value that
  // actually occurs in `embarked`.
  default: ['embarked', 'NYC']
}
// Scale: | |
// Standardization, or mean removal and variance scaling | |
// Normalization: Normalization is the process of scaling individual samples to have unit norm. | |
scale {
  // Convert these fields into standard deviations from the mean (z-scores).
  zstandard: ['age', 'sibsp', 'parch', 'fare'],
  // Replace the original columns with the standardized values.
  replace: true
}
// Encode: | |
// Binarization http://scikit-learn.org/stable/modules/preprocessing.html#binarization | |
// Encoding Categorical Features | |
encode {
  // One-hot encoding: creates sex_male, sex_female, embarked_nyc, embarked_x,
  // embarked_y, ... fields.
  onehot: ['sex', 'embarked'],
  // Bin the listed fields into discrete ranges.
  discretize: {
    fields: ['age'],
    method: 'binning',
    // NOTE(review): unclear whether 10 is the bin width or the number of bins — confirm.
    by: 10
  }
  // TODO: should discretization be handled in the ingestion phase instead?
}
// Generate: derive additional features from existing fields.
generate {
  // Fields to expand into polynomial terms.
  // NOTE(review): the polynomial degree is not specified here — confirm the default.
  polynomial: ['sibsp', 'parch']
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment