Skip to content

Instantly share code, notes, and snippets.

@djsegal
Created January 5, 2020 00:26
Show Gist options
  • Save djsegal/637d46a5ac04cc05832a3793c4b0299a to your computer and use it in GitHub Desktop.
Save djsegal/637d46a5ac04cc05832a3793c4b0299a to your computer and use it in GitHub Desktop.
# Join every per-listing table onto a copy of the price table through the
# shared "id" key (presumably these are pandas DataFrames -- confirm upstream).
cur_data = (
    price_data.copy()
    .merge(rental_data, on="id")
    .merge(location_data, on="id")
    .merge(geo_data, on="id")
)

X_train, X_test, y_train, y_test = custom_train_test_split(cur_data)

# Column groups: each list names the columns that share one treatment.
one_hot_cols = ["month"]
pass_cols = ["is_brooklyn", "density"]
# Listed only so the sanity check below accounts for them.
drop_cols = ["year", "geometry", "zipcode"]
geo_cols = geo_data.columns.drop("id").tolist()
poly_cols = rental_data.columns.drop("id").tolist()

# Every column of the merged frame must belong to exactly one group (plus the
# "id" key and the "price" target); otherwise a column was forgotten or two
# source tables collided on a name.
mentioned_cols = (
    ["id", "price"]
    + pass_cols
    + one_hot_cols
    + drop_cols
    + poly_cols
    + geo_cols
)
assert sorted(mentioned_cols) == sorted(cur_data.columns)
# Categorical encoding for the one-hot column group.
cur_one_hot = OneHotEncoder(categories="auto")

# Degree-2 expansion restricted to cross terms (no squares, no constant column).
cur_poly_feats = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)

# Rental features: reciprocal -> interactions -> variance filter -> clean ->
# Box-Cox. NOTE(review): Box-Cox only accepts strictly positive inputs, so the
# project-defined CleanFeatures step presumably guarantees that -- confirm.
cur_poly_pipeline = Pipeline(steps=[
    ("reciprocal", ReciprocalFeatures()),
    ("polynomial", cur_poly_feats),
    ("cancel", VarianceThreshold()),
    ("clean", CleanFeatures()),
    ("box_cox", PowerTransformer(method="box-cox")),
])

# Geo features are processed together with the "is_brooklyn" flag, which the
# project-defined BooleanProduct step receives by name.
cur_geo_pipeline = Pipeline(steps=[
    ("is_brooklyn", BooleanProduct("is_brooklyn")),
    ("cancel", VarianceThreshold()),
    ("power_transform", PowerTransformer()),
])

# (name, transformer, columns) triples; "is_brooklyn" is deliberately routed
# to both the geo pipeline and the passthrough group.
cur_col_transformers = [
    ("one_hot", cur_one_hot, one_hot_cols),
    ("poly_feats", cur_poly_pipeline, poly_cols),
    ("geo_feats", cur_geo_pipeline, geo_cols + ["is_brooklyn"]),
    ("passthrough", PassThroughTransformer(), pass_cols),
]
cur_transformer = ColumnTransformer(transformers=cur_col_transformers)
# Feature selectors: each wraps an estimator and keeps only the features whose
# fitted importance reaches the threshold.
#
# max_iter is written as an integer literal: scikit-learn documents it as an
# int, and releases that validate constructor parameters reject the float 5e4
# when the estimator is fitted.
cur_selector_lasso = SelectFromModel(
    LassoCV(cv=4, n_jobs=-1, max_iter=50000, random_state=42),
    threshold=1e-3
)
cur_selector_forest = SelectFromModel(
    ExtraTreesRegressor(n_estimators=128, n_jobs=-1, max_depth=28, random_state=42),
    threshold=1e-3
)

# Selection cascade: two lasso passes, one tree-based pass, then two more
# lasso passes. Each lasso stage is an independent clone, so no fitted state
# is shared between stages.
cur_selector = Pipeline([
    ("lasso_1", clone(cur_selector_lasso)),
    ("lasso_2", clone(cur_selector_lasso)),
    ("forest", cur_selector_forest),
    ("lasso_3", clone(cur_selector_lasso)),
    ("lasso_4", clone(cur_selector_lasso))
], verbose=True)
# Base learners for the stacked ensemble. All seeded with random_state=42
# where the estimator accepts a seed.
#
# ElasticNetCV's max_iter is written as an integer literal: scikit-learn
# documents it as an int, and releases that validate constructor parameters
# reject the float 5e4 when the estimator is fitted.
regressor_list = [
    ElasticNetCV(cv=4, n_jobs=-1, max_iter=50000, n_alphas=256, random_state=42),
    BooleanForkRegressor(HuberRegressor(max_iter=500, tol=1e-4)),
    RandomForestRegressor(n_estimators=48, n_jobs=-1, max_depth=20, random_state=42),
    ExtraTreesRegressor(n_estimators=64, n_jobs=-1, max_depth=24, random_state=42),
    AdaBoostRegressor(n_estimators=128, learning_rate=0.6, random_state=42),
    XGBRegressor(objective='reg:squarederror', n_jobs=-1, n_estimators=256, random_state=42)
]

# Meta-learner: ridge regression choosing its penalty from 21 log-spaced
# candidates between 0.01 and 100.
meta_regressor = RidgeCV(
    alphas=np.logspace(
        np.log10(0.01), np.log10(100), 21
    )
)

# Stack the base learners under the ridge meta-learner. The `regressors=` /
# `meta_regressor=` keywords match mlxtend's StackingRegressor (presumably the
# class imported here -- confirm against the file's imports).
cur_regressor = StackingRegressor(
    regressors=regressor_list,
    meta_regressor=meta_regressor,
    verbose=True
)
def _undo_log10(cur_log_val):
    """Invert the log10 target transform: return 10 raised to ``cur_log_val``."""
    return 10 ** cur_log_val


# End-to-end estimator: column preprocessing -> standardisation -> feature
# selection -> stacked regressor.
cur_pipeline = Pipeline(steps=[
    ("transformer", cur_transformer),
    ("scalar", StandardScaler()),
    ("selector", cur_selector),
    ("regressor", cur_regressor),
])

# Train against log10 of the target and map predictions back with 10 ** x, so
# callers keep working in the original target units.
cur_model = TransformedTargetRegressor(
    regressor=cur_pipeline,
    func=np.log10,
    inverse_func=_undo_log10,
)

cur_model.fit(X_train, y_train)

# Score of the fitted model on the held-out split.
R_2 = cur_model.score(X_test, y_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment