ravishchawla / airbnb_post_5.py
Created February 27, 2019 17:43
AirBnB post: Train test split
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out a third of the listings for testing
l_X, l_y = listings_cleaned.drop('price', axis=1), listings_cleaned['price']
l_X_train, l_X_test, l_y_train, l_y_test = train_test_split(l_X, l_y, test_size=0.33, random_state=1024)

# criterion 'mse' was renamed to 'squared_error' in scikit-learn 1.0
rf_classifier = RandomForestRegressor(n_estimators=400, criterion='squared_error', random_state=1024)
rf_classifier.fit(l_X_train, l_y_train)
l_y_pred = rf_classifier.predict(l_X_test)
l_y_pred_tr = rf_classifier.predict(l_X_train)

# Test-set RMSE
print(math.sqrt(mean_squared_error(l_y_test, l_y_pred)))
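The training-set predictions are computed above but never used; comparing the two RMSE values is a quick over-fitting check (a small addition, not part of the original gist):

# A large train/test RMSE gap suggests the forest is over-fitting
rmse_train = math.sqrt(mean_squared_error(l_y_train, l_y_pred_tr))
rmse_test = math.sqrt(mean_squared_error(l_y_test, l_y_pred))
print(f'train RMSE: {rmse_train:.2f}, test RMSE: {rmse_test:.2f}')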
ravishchawla / airbnb_post_6.py
Created February 27, 2019 17:50
AirBnB post: PCA results
from sklearn.decomposition import PCA

# Project the cleaned features onto the first 400 principal components
pca = PCA(n_components=400, random_state=1024)
pca.fit(l_X)
listings_pca = pca.transform(l_X)
l_X_p_train, l_X_p_test, l_y_p_train, l_y_p_test = train_test_split(listings_pca, l_y, test_size=0.33, random_state=1024)

# criterion 'mse' was renamed to 'squared_error' in scikit-learn 1.0
rf_classifier_2 = RandomForestRegressor(n_estimators=100, criterion='squared_error', random_state=1024)
rf_classifier_2.fit(l_X_p_train, l_y_p_train)
l_y_p_pred = rf_classifier_2.predict(l_X_p_test)
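One caveat with this snippet: PCA is fit on the full feature matrix before the split, so the components absorb information from the test rows. A leakage-free variant (my adaptation, not the original code) fits both steps on the training split only, reusing the split from the first gist:

from sklearn.pipeline import Pipeline

# Fit PCA and the forest on training rows only, then score on the held-out rows
pca_rf = Pipeline([
    ('pca', PCA(n_components=400, random_state=1024)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=1024)),
])
pca_rf.fit(l_X_train, l_y_train)
print(math.sqrt(mean_squared_error(l_y_test, pca_rf.predict(l_X_test))))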
ravishchawla / airbnb_post_7.py
Created February 27, 2019 17:58
AirBnB post: PCA component weights
# Map each principal component's weights back to the original feature names
principal_weights = pd.DataFrame(pca.components_, columns=l_X.columns)
p_c_1 = principal_weights.iloc[0]

# Ten most negative and ten most positive weights in the first component
print(p_c_1.sort_values()[0:10])
print(p_c_1.sort_values()[-10:])
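It can also help to check how much variance the retained components actually capture; the fitted PCA exposes this directly (a small addition, not in the original gist):

import numpy as np

# Cumulative share of variance explained by the retained components
cum_var = np.cumsum(pca.explained_variance_ratio_)
print(f'{cum_var[-1]:.1%} of the variance is retained by {pca.n_components_} components')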
ravishchawla / starbucks_post_1.py
Created June 6, 2019 16:43
Starbucks post: Loading the data and creating visualizations
import pandas as pd
import seaborn as sns

# Data Loading
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

# Cross Plot Visualizations
sns.pairplot(portfolio, hue='offer_type')
sns.pairplot(profile.dropna(), hue='gender')

'''Cleaning the *Portfolio* dataset'''
# Dummy-fy the list-valued channels attribute
portfolio_channels = portfolio['channels'].apply(lambda x: ' '.join(x)).str.get_dummies(' ')
portfolio_channels.columns = ['channel_' + col for col in portfolio_channels.columns]

# Dummy-fy the offer type attribute
portfolio_offertype = portfolio['offer_type'].str.get_dummies()
portfolio_offertype.columns = ['offer_' + col for col in portfolio_offertype.columns]

# Add dummy columns and drop existing ones (see the sketch below)
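The last comment describes a step whose code is not included in the gist; a minimal sketch of it (the exact columns kept in the original post may differ):

# Replace the raw channels / offer_type columns with their dummy encodings
portfolio = pd.concat([portfolio.drop(['channels', 'offer_type'], axis=1),
                       portfolio_channels, portfolio_offertype], axis=1)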
ravishchawla / starbucks_post_3.py
Created June 6, 2019 16:51
Starbucks post: Create transactions only dataframes
# Join each transcript event with the customer's profile
transcript_profile = pd.merge(transcript, profile, left_on='person', right_on='id')

# Keep only records of actual transactions, and keep numeric columns
transaction_data_only = transcript_profile[transcript_profile.event_transaction == 1]
transaction_data_only = transaction_data_only.select_dtypes(exclude=['object'])

# Separate into male and female dataframes
transaction_data_f = transaction_data_only[transaction_data_only.gender_F == 1]
transaction_data_m = transaction_data_only[transaction_data_only.gender_M == 1]
'''Cleaning the *Profile* dataset'''
# Drop profiles missing gender or income, and dummy-fy gender
profile = profile.dropna(axis=0, subset=['gender', 'income'])
profile_gender = profile['gender'].str.get_dummies()
profile_gender.columns = ['gender_' + col for col in profile_gender.columns]

# Separate the YYYYMMDD date attribute into year, month, and day integers
profile_date = profile['became_member_on']
profile_year = profile_date.apply(lambda d: str(d)).str[0:4].astype('int').rename('member_year')
profile_month = profile_date.apply(lambda d: str(d)).str[4:6].astype('int').rename('member_month')
profile_day = profile_date.apply(lambda d: str(d)).str[6:8].astype('int').rename('member_day')
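The same split can be done in one pass with pandas' datetime accessor, assuming became_member_on holds YYYYMMDD integers (an equivalent alternative, not the original code):

# Parse the dates once, then read the parts off the .dt accessor
member_date = pd.to_datetime(profile['became_member_on'], format='%Y%m%d')
profile_year = member_date.dt.year.rename('member_year')
profile_month = member_date.dt.month.rename('member_month')
profile_day = member_date.dt.day.rename('member_day')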
'''Cleaning the *Transcript* dataset'''
# Dummy-fy the event type ("offer received" -> event_offer_received, ...)
transcript_event = transcript['event'].str.get_dummies()
transcript_event.columns = ['event_' + '_'.join(col.split(' ')) for col in transcript_event.columns]

# Standardize "offer id" keys inside the value dicts to "offer_id"
def transcript_value_clean(x_dict):
    if 'offer id' in x_dict:
        x_dict['offer_id'] = x_dict['offer id']
        del x_dict['offer id']
    return x_dict
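The cleaner is defined but never applied in the gist, and the merge below needs an offer_id column; presumably something like this runs in between (my reconstruction, not the original code):

# Normalize the keys, then pull offer_id out of the value dicts into its own column
transcript['value'] = transcript['value'].apply(transcript_value_clean)
transcript['offer_id'] = transcript['value'].apply(lambda v: v.get('offer_id'))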
# Attach offer metadata to each event, then group events by (person, offer)
transcript_portfolio = pd.merge(transcript, portfolio, left_on='offer_id', right_on='id', how='left')
transcript_by_group = transcript_portfolio.groupby(['person', 'offer_id'])
completion_details = []
'''
Go through each group in the transaction grouping. Because iterating over groups
can be slow, we use vectorized operations inside the main loop.
'''
for i, g in transcript_by_group:
    ...  # per-group logic that fills completion_details; see the sketch below
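The loop body itself did not make it into the gist. A minimal sketch of per-group, vectorized processing, assuming the event dummy columns built above have been joined onto transcript_portfolio (the event_offer_completed name is illustrative):

for (person, offer_id), g in transcript_by_group:
    # Vectorized column operations on the group instead of row-by-row Python
    completion_details.append({
        'person': person,
        'offer_id': offer_id,
        'n_events': len(g),
        'completed': int(g['event_offer_completed'].sum() > 0),
    })
completion_df = pd.DataFrame(completion_details)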
'''Hyper-parameter tuning with grid search'''
from sklearn.model_selection import GridSearchCV

params = {'n_estimators': [10, 50, 100], 'max_depth': [5, 10, 30, 80],
          'max_features': [1, 3, 8, 15], 'min_samples_split': [3, 5, 10, 30, 50, 100]}

# 5-fold grid search over the forest's hyper-parameters, scored on R^2
g_rfm = RandomForestRegressor(random_state=1024)
g_src = GridSearchCV(g_rfm, params, verbose=10, cv=5, scoring='r2')
g_src.fit(X_train, y_train)
print(g_src.best_params_)

# Re-fit a forest with the best parameters found by the search
tuned_rf_model = RandomForestRegressor(max_depth=30, max_features=3, min_samples_split=100, n_estimators=100)
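The tuned forest is instantiated but not fit in the gist; the natural next step, assuming the same X_train/X_test split the search used, would be:

# Fit the tuned forest and report its held-out R^2
tuned_rf_model.fit(X_train, y_train)
print(tuned_rf_model.score(X_test, y_test))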