jpotts18 · September 23, 2016 17:51
diff --git a/model.conf b/model.conf
 // survival        Survival (0 = No; 1 = Yes)
 // pclass          Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
 // name            Name
 // sex             Sex
 // age             Age
 // sibsp           Number of Siblings/Spouses Aboard
 // parch           Number of Parents/Children Aboard
 // ticket          Ticket Number
 // fare            Passenger Fare
 // cabin           Cabin
 // embarked        Port of Embarkation


 // Binary
 // Multiclass
 // Regression

 // What type of model are you interested in creating (classification or regression)?
 classification {
   // Field that holds the labeled answer set.
   target: 'survival',
   // Candidate models to try.
   models: [
     'KNeighborsClassifier',
     'SVC',
     'DecisionTreeClassifier',
     'RandomForestClassifier',
     'AdaBoostClassifier',
     'GaussianNB'
   ],
   // Grid-search settings.
   // NOTE(review): 'rmse' is normally a regression error metric; confirm it is
   // the intended loss for a classification run.
   grid_search: {
     'loss': 'rmse'
   }
 }

 classification {
   // Field that holds the labeled answer set.
   target: 'survival',
   // Models combined by the ensemble. Names follow the scikit-learn class
   // names used by the other classification block in this file.
   models: [
     'LogisticRegression',
     'RandomForestClassifier',
     'GaussianNB'
   ],
   // Ensemble strategies under consideration:
   //   bagging
   //   boosting
   //   voting
   voting: {
     // 'hard' = majority vote
     style: 'hard',
     // Presumably one weight per entry in `models`, in the same order; confirm.
     weights: [2, 1, 2]
   },
   // NOTE(review): the other classification block spells this key
   // `grid_search`; confirm which spelling the consumer expects.
   gridsearch {
     cross_validations: 5
   }
 }
diff --git a/transform.conf b/transform.conf
 // titanic data set
 // survival        Survival (0 = No; 1 = Yes)
 // pclass          Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
 // name            Name
 // sex             Sex
 // age             Age
 // sibsp           Number of Siblings/Spouses Aboard
 // parch           Number of Parents/Children Aboard
 // ticket          Ticket Number
 // fare            Passenger Fare
 // cabin           Cabin
 // embarked        Port of Embarkation

 // Impute: 
 // Imputation. For various reasons, many real world datasets contain missing values, 
 // often encoded as blanks, NaNs or other placeholders. Such datasets however are 
 // incompatible with scikit-learn estimators which assume that all values in an array 
 // are numerical, and that all have and hold meaning.
 // Note: This must be done before scaling and the other transform steps below.
 impute {
   // Null values in these fields are replaced with the mean.
   mean: ['age', 'fare'],
   // Null values are replaced with a default. The list appears to be a
   // [field, default value] pair rather than a list of fields; confirm.
   // NOTE(review): confirm 'NYC' is a value that actually occurs in `embarked`.
   default: ['embarked', 'NYC']
 }

 // Scale:
 // Standardization, or mean removal and variance scaling
 // Normalization: Normalization is the process of scaling individual samples to have unit norm. 

 scale {
   // Fields converted into standard deviations from the mean.
   zstandard: ['age', 'sibsp', 'parch', 'fare'],
   // Replace the columns with their standardized values.
   replace: true
 }

 // Encode:
 // Binarization http://scikit-learn.org/stable/modules/preprocessing.html#binarization
 // Encoding Categorical Features
 encode {
   // Creates sex_male, sex_female, embarked_nyc, embarked_x, embarked_y, ... fields.
   onehot: ['sex', 'embarked'],
   // Bin the listed fields.
   discretize: {
     fields: ['age'],
     method: 'binning',
     // Presumably the bin width (or bin count); confirm with the consumer.
     by: 10
   }
   // Open question: should discretization be handled in the ingestion phase instead?
 }

 generate {
   // Presumably the fields from which polynomial features are generated; confirm.
   polynomial: ['sibsp', 'parch']
 }
	// survival Survival (0 = No; 1 = Yes)
	// pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
	// name Name
	// sex Sex
	// age Age
	// sibsp Number of Siblings/Spouses Aboard
	// parch Number of Parents/Children Aboard
	// ticket Ticket Number
	// fare Passenger Fare
	// cabin Cabin
	// embarked Port of Embarkation


	// Binary
	// Multiclass
	// Regression

	// What type of model are you interested in creating (classification or regression)?
	classification {
	  // Field that holds the labeled answer set.
	  target: 'survival',
	  // Candidate models to try.
	  models: [
	    'KNeighborsClassifier',
	    'SVC',
	    'DecisionTreeClassifier',
	    'RandomForestClassifier',
	    'AdaBoostClassifier',
	    'GaussianNB'
	  ],
	  // Grid-search settings.
	  // NOTE(review): 'rmse' is normally a regression error metric; confirm it is
	  // the intended loss for a classification run.
	  grid_search: {
	    'loss': 'rmse'
	  }
	}

	classification {
	  // Field that holds the labeled answer set.
	  target: 'survival',
	  // Models combined by the ensemble. Names follow the scikit-learn class
	  // names used by the other classification block in this file.
	  models: [
	    'LogisticRegression',
	    'RandomForestClassifier',
	    'GaussianNB'
	  ],
	  // Ensemble strategies under consideration:
	  //   bagging
	  //   boosting
	  //   voting
	  voting: {
	    // 'hard' = majority vote
	    style: 'hard',
	    // Presumably one weight per entry in `models`, in the same order; confirm.
	    weights: [2, 1, 2]
	  },
	  // NOTE(review): the other classification block spells this key
	  // `grid_search`; confirm which spelling the consumer expects.
	  gridsearch {
	    cross_validations: 5
	  }
	}
	// titanic data set
	// survival Survival (0 = No; 1 = Yes)
	// pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
	// name Name
	// sex Sex
	// age Age
	// sibsp Number of Siblings/Spouses Aboard
	// parch Number of Parents/Children Aboard
	// ticket Ticket Number
	// fare Passenger Fare
	// cabin Cabin
	// embarked Port of Embarkation

	// Impute:
	// Imputation. For various reasons, many real world datasets contain missing values,
	// often encoded as blanks, NaNs or other placeholders. Such datasets however are
	// incompatible with scikit-learn estimators which assume that all values in an array
	// are numerical, and that all have and hold meaning.
	// Note: This must be done before scaling and the other transform steps below.
	impute {
	  // Null values in these fields are replaced with the mean.
	  mean: ['age', 'fare'],
	  // Null values are replaced with a default. The list appears to be a
	  // [field, default value] pair rather than a list of fields; confirm.
	  // NOTE(review): confirm 'NYC' is a value that actually occurs in `embarked`.
	  default: ['embarked', 'NYC']
	}

	// Scale:
	// Standardization, or mean removal and variance scaling
	// Normalization: Normalization is the process of scaling individual samples to have unit norm.

	scale {
	  // Fields converted into standard deviations from the mean.
	  zstandard: ['age', 'sibsp', 'parch', 'fare'],
	  // Replace the columns with their standardized values.
	  replace: true
	}

	// Encode:
	// Binarization http://scikit-learn.org/stable/modules/preprocessing.html#binarization
	// Encoding Categorical Features
	encode {
	  // Creates sex_male, sex_female, embarked_nyc, embarked_x, embarked_y, ... fields.
	  onehot: ['sex', 'embarked'],
	  // Bin the listed fields.
	  discretize: {
	    fields: ['age'],
	    method: 'binning',
	    // Presumably the bin width (or bin count); confirm with the consumer.
	    by: 10
	  }
	  // Open question: should discretization be handled in the ingestion phase instead?
	}

	generate {
	  // Presumably the fields from which polynomial features are generated; confirm.
	  polynomial: ['sibsp', 'parch']
	}