Ravish Chawla ravishchawla

I am a graduate student in Machine Learning at the Georgia Institute of Technology, studying Data Analytics, Visualization, and Engineering.

ravishchawla / starbucks_post_3.py

Created June 6, 2019 16:51

Starbucks post: Create transactions only dataframes

	# Attach each person's profile attributes to their transcript events
	# (transcript.person is matched against profile.id).
	transcript_profile = transcript.merge(profile, left_on='person', right_on='id')

	# Restrict to rows flagged as actual transactions, then discard every
	# object-typed column so only the non-object columns remain.
	is_transaction = transcript_profile['event_transaction'] == 1
	transaction_data_only = transcript_profile.loc[is_transaction].select_dtypes(exclude=['object'])

	# Split the transactions using the gender indicator columns.
	transaction_data_f = transaction_data_only.loc[transaction_data_only['gender_F'] == 1]
	transaction_data_m = transaction_data_only.loc[transaction_data_only['gender_M'] == 1]

ravishchawla / starbucks_post_2.py

Last active June 6, 2019 17:36

	'''Cleaning the Portfolio dataset'''
	# One-hot encode the channels: each row's channel list is joined into a
	# space-separated string, which is then split back out into indicator columns.
	channels_joined = portfolio['channels'].apply(' '.join)
	portfolio_channels = channels_joined.str.get_dummies(sep=' ').add_prefix('channel_')

	# One-hot encode the offer type and prefix the resulting column names.
	portfolio_offertype = portfolio['offer_type'].str.get_dummies().add_prefix('offer_')

	# Add dummy columns and drop existing

ravishchawla / starbucks_post_1.py

Created June 6, 2019 16:43

Starbucks post: Loading the data and creating visualizations

	# Data Loading
	# All three files are read as line-delimited JSON records, so the same
	# reader options are shared between the calls.
	json_lines_opts = {'orient': 'records', 'lines': True}
	portfolio = pd.read_json('data/portfolio.json', **json_lines_opts)
	profile = pd.read_json('data/profile.json', **json_lines_opts)
	transcript = pd.read_json('data/transcript.json', **json_lines_opts)

	# Cross Plot Visualizations
	# Pairwise plots of the portfolio columns, coloured by offer type.
	sns.pairplot(portfolio, hue='offer_type')

	# Pairwise plots of the profile columns, coloured by gender; rows with any
	# missing value are removed first.
	profile_complete = profile.dropna()
	sns.pairplot(profile_complete, hue='gender')

ravishchawla / airbnb_post_7.py

Created February 27, 2019 17:58

AirBnB post: PCA results

	# Tabulate the fitted PCA components, labelling the columns with the
	# feature names of l_X.
	principal_weights = pd.DataFrame(pca.components_, columns=l_X.columns)

	# Weights of the first component, sorted ascending once and then sliced.
	p_c_1 = principal_weights.iloc[0]
	p_c_1_sorted = p_c_1.sort_values()
	print(p_c_1_sorted.iloc[:10])   # ten smallest weights
	print(p_c_1_sorted.iloc[-10:])  # ten largest weights

ravishchawla / airbnb_post_6.py

Created February 27, 2019 17:50

AirBnB post: PCA results

	# Fit a 400-component PCA on the feature matrix and project the features
	# onto those components.
	pca = PCA(n_components=400, random_state=1024)
	pca.fit(l_X)

	listings_pca = pca.transform(l_X)
	l_X_p_train, l_X_p_test, l_y_p_train, l_y_p_test = train_test_split(
		listings_pca, l_y, test_size=0.33, random_state=1024)

	# `criterion` is intentionally left at its default. The default is the
	# mean-squared-error criterion in every scikit-learn release, whereas the
	# explicit spelling 'mse' was deprecated in 1.0 and removed in 1.2 (renamed
	# to 'squared_error'), so passing it raises on current versions.
	rf_classifier_2 = RandomForestRegressor(n_estimators=100, random_state=1024)
	rf_classifier_2.fit(l_X_p_train, l_y_p_train)

	# Predictions on the held-out PCA-projected test split.
	l_y_p_pred = rf_classifier_2.predict(l_X_p_test)

ravishchawla / airbnb_post_5.py

Created February 27, 2019 17:43

AirBnB post: Train test split

	# Separate the target column ('price') from the feature columns.
	l_X = listings_cleaned.drop('price', axis=1)
	l_y = listings_cleaned['price']

	# Hold out 33% of the rows for testing; the seed is fixed for reproducibility.
	l_X_train, l_X_test, l_y_train, l_y_test = train_test_split(
		l_X, l_y, test_size=0.33, random_state=1024)

	# `criterion` is intentionally left at its default. The default is the
	# mean-squared-error criterion in every scikit-learn release, whereas the
	# explicit spelling 'mse' was deprecated in 1.0 and removed in 1.2 (renamed
	# to 'squared_error'), so passing it raises on current versions.
	rf_classifier = RandomForestRegressor(n_estimators=400, random_state=1024)
	rf_classifier.fit(l_X_train, l_y_train)

	# Predictions for both splits (the training predictions are kept for
	# later inspection).
	l_y_pred = rf_classifier.predict(l_X_test)
	l_y_pred_tr = rf_classifier.predict(l_X_train)

	# Root-mean-squared error on the test split.
	print(math.sqrt(mean_squared_error(l_y_test, l_y_pred)))

ravishchawla / airbnb_post_4.py

Created February 25, 2019 22:47

AirBnB post: Categorical data cleaning

	# Gathering the different types of Non-numerical features

	# Columns grouped under "categorical".
	categorical_attributes = [
		'experiences_offered',
		'host_location',
		'host_response_time',
		'host_neighbourhood',
		'street',
		'neighbourhood',
		'neighbourhood_cleansed',
		'neighbourhood_group_cleansed',
		'city',
		'state',
		'market',
		'smart_location',
		'country_code',
		'country',
		'property_type',
		'room_type',
		'bed_type',
		'cancellation_policy',
	]

	# Columns grouped under "full text".
	full_text_attributes = [
		'name',
		'summary',
		'space',
		'description',
		'neighborhood_overview',
		'interaction',
		'house_rules',
		'host_about',
	]

	# Columns grouped under "dropping".
	dropping_attributes = [
		'listing_url',
		'picture_url',
		'host_url',
		'host_thumbnail_url',
		'host_picture_url',
		'last_scraped',
		'host_name',
		'calendar_last_scraped',
		'calendar_updated',
	]

	# Columns grouped under "date".
	date_attributes = [
		'host_since',
		'first_review',
		'last_review',
	]
	bool_attributes = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'is_location_exact', 'has_availability', 'requires_license', 'instant_bookable', 'is_business_travel_ready', 'require_guest_profile_picture', 'require_guest_phone

ravishchawla / airbnb_post_3.py

Created February 25, 2019 22:21

AirBnB post: Cleaning the data

	# Fraction of missing values per column, indexed by column name.
	miss_listings = listings.isnull().sum() / len(listings)

	# Drop every column in which more than half of the values are missing.
	miss_gr_05 = listings.columns[miss_listings > 0.5]
	listings = listings.drop(miss_gr_05, axis=1)

	# `miss_listings` still has one entry per ORIGINAL column, so using it as a
	# boolean mask over the reduced `listings.columns` fails with a length
	# mismatch as soon as any column was dropped above. Remove the same labels
	# from the fractions first so the mask lines up with the surviving columns.
	miss_remaining = miss_listings.drop(miss_gr_05)
	miss_gr_03 = listings.columns[miss_remaining > 0.3]
	miss_vals_03 = miss_listings > 0.3
	print([col + ' ' + str(miss_listings[col]) for col in miss_gr_03])

	# Convert the response rate to a float by removing the '%' sign from the
	# string form of each value.
	listings[['host_response_rate']] = listings['host_response_rate'].apply(lambda col: float(str(col).replace("%", "")))

ravishchawla / airbnb_post_2.py

Created February 25, 2019 20:10

AirBnB post: Data exploration

	# Print summary statistics for a DataFrame: its shape and the columns that
	# contain missing values together with the fraction missing in each.
	# NOTE(review): the statements below appear at the same indent as the `def`
	# line in this listing, so where the function body ends cannot be determined
	# from here -- confirm against the original gist before running.
	def df_stats(df):
	print('Shape: ' , df.shape);

	# Fraction of null entries per column.
	missings = df.isnull().sum() / len(df);
	# Map column name -> missing fraction, restricted to columns with any nulls.
	missing_vals = dict(zip(df.columns[missings > 0], missings[missings > 0]));
	# Prints (column, fraction) pairs ordered by fraction, largest first.
	print('# Columns with any missing elements : ' , [(w, missing_vals[w]) for w in sorted(missing_vals, key=missing_vals.get, reverse=True)])

	print();

	# NOTE(review): presumably a top-level heading printed before calling
	# df_stats on the listings data; the call itself is not visible here.
	print('Listings: ');

ravishchawla / airbnb_post_1.py

Created February 25, 2019 20:07

AirBnB post: Imports and Data Loading

	import pandas as pd;
	import numpy as np;
	import matplotlib;
	import matplotlib.pyplot as plt;
	import seaborn as sns;
	from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, f1_score
	from IPython.display import display