This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Separate the predictors from the regression target ('price').
l_y = listings_cleaned['price']
l_X = listings_cleaned.drop('price', axis=1)

# Hold out a third of the rows; the fixed seed keeps the split reproducible.
l_X_train, l_X_test, l_y_train, l_y_test = train_test_split(
    l_X, l_y, test_size=0.33, random_state=1024
)

# Baseline random forest.  The split criterion is left at its default (mean
# squared error): the old spelling 'mse' was removed in scikit-learn 1.2 and
# the replacement 'squared_error' does not exist before 1.0, so relying on the
# default is the only form that runs on every version with the same loss.
rf_classifier = RandomForestRegressor(n_estimators=400, random_state=1024)
rf_classifier.fit(l_X_train, l_y_train)

# Predictions for the held-out rows and for the training rows.
l_y_pred = rf_classifier.predict(l_X_test)
l_y_pred_tr = rf_classifier.predict(l_X_train)

# Root-mean-squared error on the held-out rows.
print(math.sqrt(mean_squared_error(l_y_test, l_y_pred)))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split the raw features BEFORE learning the projection.  Fitting PCA on the
# full table lets the held-out rows influence the components, which leaks
# test information into the model.  The same test_size/random_state as the
# un-projected model is used, so the row assignment is unchanged.
l_X_raw_train, l_X_raw_test, l_y_p_train, l_y_p_test = train_test_split(
    l_X, l_y, test_size=0.33, random_state=1024
)

# Learn the components from the training rows only, then project both parts.
# NOTE(review): n_components=400 needs at least 400 training rows and 400
# feature columns - confirm against the size of listings_cleaned.
pca = PCA(n_components=400, random_state=1024)
pca.fit(l_X_raw_train)
l_X_p_train = pca.transform(l_X_raw_train)
l_X_p_test = pca.transform(l_X_raw_test)

# Projection of the complete feature table, kept so that code which still
# refers to `listings_pca` continues to work.
listings_pca = pca.transform(l_X)

# Random forest on the projected features.  The criterion is left at its
# default (mean squared error); the spelling 'mse' was removed in
# scikit-learn 1.2.
rf_classifier_2 = RandomForestRegressor(n_estimators=100, random_state=1024)
rf_classifier_2.fit(l_X_p_train, l_y_p_train)
l_y_p_pred = rf_classifier_2.predict(l_X_p_test)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Table of PCA loadings: the columns are labelled with the original feature
# names, and each row holds the weights of one fitted component.
principal_weights = pd.DataFrame(pca.components_, columns=l_X.columns)

# Weights of the first component, ordered from smallest to largest.
p_c_1 = principal_weights.iloc[0]
p_c_1_sorted = p_c_1.sort_values()

# Ten smallest weights, then the ten largest.
print(p_c_1_sorted.iloc[:10])
print(p_c_1_sorted.iloc[-10:])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data Loading
# All three files are read as line-delimited JSON records.
_json_lines_options = {'orient': 'records', 'lines': True}
portfolio = pd.read_json('data/portfolio.json', **_json_lines_options)
profile = pd.read_json('data/profile.json', **_json_lines_options)
transcript = pd.read_json('data/transcript.json', **_json_lines_options)

# Cross Plot Visualizations
# Offers coloured by offer type.
sns.pairplot(portfolio, hue='offer_type')
# Profiles coloured by gender; rows with any missing value are left out.
_profile_complete = profile.dropna()
sns.pairplot(_profile_complete, hue='gender')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Cleaning the *Portfolio* dataset'''
# Indicator columns for the channels: each row's channel collection is joined
# into one space-separated string and then split back into 0/1 columns, which
# are given a 'channel_' prefix.
portfolio_channels = (
    portfolio['channels']
    .map(' '.join)
    .str.get_dummies(' ')
    .add_prefix('channel_')
)
# Indicator columns for the offer type, given an 'offer_' prefix.
portfolio_offertype = portfolio['offer_type'].str.get_dummies().add_prefix('offer_')
# Add dummy columns and drop existing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Attach the matching profile row to every transcript row
# (transcript.person == profile.id).
transcript_profile = transcript.merge(profile, left_on='person', right_on='id')

# Keep only the rows flagged as transactions, then drop every object-typed
# column so that only the non-object columns remain.
is_transaction = transcript_profile['event_transaction'] == 1
transaction_data_only = transcript_profile[is_transaction].select_dtypes(
    exclude=['object']
)

# Split the transactions by the gender indicator columns.
transaction_data_f = transaction_data_only[transaction_data_only['gender_F'] == 1]
transaction_data_m = transaction_data_only[transaction_data_only['gender_M'] == 1]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Cleaning the *Profile* dataset'''
# Discard profiles that lack a gender or an income.
profile = profile.dropna(axis=0, subset=['gender', 'income'])

# Indicator columns for gender, given a 'gender_' prefix.
profile_gender = profile['gender'].str.get_dummies().add_prefix('gender_')

# Split the membership date into integer year, month and day columns by
# slicing its string form.
# NOTE(review): the slices assume the value renders as YYYYMMDD - confirm.
profile_date = profile['became_member_on']
member_date_text = profile_date.map(str)
profile_year = member_date_text.str[0:4].astype('int').rename('member_year')
profile_month = member_date_text.str[4:6].astype('int').rename('member_month')
profile_day = member_date_text.str[6:8].astype('int').rename('member_day')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Cleaning the *Transcript* dataset'''
# Indicator columns for the event; each column is named 'event_' followed by
# the event text with its spaces turned into underscores.
transcript_event = transcript['event'].str.get_dummies().rename(
    columns=lambda col: 'event_' + col.replace(' ', '_')
)
# Standardize the "offer id" key so that it is always spelled "offer_id".
def transcript_value_clean(x_dict):
    """Rename the ``'offer id'`` key of *x_dict* to ``'offer_id'``.

    The dictionary is modified in place and the same object is returned, so
    the function can be used directly as a mapping callback.  Dictionaries
    without an ``'offer id'`` key are returned untouched.  When both
    spellings are present, the value stored under ``'offer id'`` wins.

    Args:
        x_dict: Dictionary that may contain an ``'offer id'`` key.

    Returns:
        The same dictionary object, with ``'offer id'`` moved to
        ``'offer_id'`` if it was present.
    """
    if 'offer id' in x_dict:
        # pop() removes the old key and hands back its value in one step.
        x_dict['offer_id'] = x_dict.pop('offer id')
    return x_dict
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Left join: every transcript row is kept, and the portfolio columns are
# attached where transcript.offer_id matches portfolio.id.
transcript_portfolio = transcript.merge(
    portfolio, how='left', left_on='offer_id', right_on='id'
)

# One group per (person, offer) pair.
transcript_by_group = transcript_portfolio.groupby(by=['person', 'offer_id'])

# Starts empty; nothing in this excerpt adds to it yet.
completion_details = []
''' | |
Go through each group in the transaction grouping. Because iterating can be slow, | |
we will use vectorized operations inside the main loop. | |
''' | |
for i, g in transcript_by_group: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hyper-parameter grid explored by the search.
params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [5, 10, 30, 80],
    'max_features': [1, 3, 8, 15],
    'min_samples_split': [3, 5, 10, 30, 50, 100],
}

# Exhaustive 5-fold cross-validated search, scored by R^2.
g_rfm = RandomForestRegressor(random_state=1024)
g_src = GridSearchCV(g_rfm, params, verbose=10, cv=5, scoring='r2')
g_src.fit(X_train, y_train)
print(g_src.best_params_)

# Build the tuned (still unfitted) model from whatever the search selected,
# rather than from hand-copied values that go stale when the data or the
# grid changes.  The seed matches the searched estimator so that the tuned
# model is reproducible and comparable with the cross-validation results.
tuned_rf_model = RandomForestRegressor(random_state=1024, **g_src.best_params_)