Ravish Chawla ravishchawla

I am a graduate student in Machine Learning at the Georgia Institute of Technology, studying Data Analytics, Visualization, and Engineering.

ravishchawla / medium_lstm_encode_3.py

Last active June 27, 2018 18:59

	# Use a Keras Tokenizer and fit on the sentences

	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(sentences)
	# The encoded sentences may differ in length, so the array has to hold the
	# per-sentence lists as objects; recent NumPy versions refuse to build a
	# ragged array unless dtype=object is given explicitly.
	text_sequences = np.array(tokenizer.texts_to_sequences(sentences), dtype=object)
	sequence_dict = tokenizer.word_index
	word_dict = {num: val for val, num in sequence_dict.items()}

	# sequence_dict maps word -> encoding; word_dict is its inverse, encoding -> word

ravishchawla / medium_lstm_pad_4.py

Created June 27, 2018 19:03

	# Bring every review to exactly 60 tokens; over-long reviews lose their tail.
	max_cap = 60
	X = pad_sequences(reviews_encoded, maxlen=max_cap, truncating='post')

	# One-hot label matrix: [0, 1] when the label contains '0', otherwise [1, 0].
	Y = np.array([[1, 0] if '0' not in label else [0, 1] for label in labels])

	# Fix the random seed and build the ordered positions 0 .. len(X) - 1.
	# NOTE(review): nothing visible here shuffles the positions yet; presumably
	# the shuffle follows in the part of the snippet that is cut off - confirm.
	np.random.seed(1024)
	random_posits = np.arange(len(X))

ravishchawla / medium_lstm_model_5.py

Last active June 27, 2018 19:17

	# Two stacked LSTM layers on top of a trainable embedding, ending in a
	# two-way softmax over the review classes.
	model = Sequential()
	# The word indices in word_dict come from the Keras Tokenizer and run from
	# 1 to len(word_dict); 0 is the padding value. The embedding table therefore
	# needs len(word_dict) + 1 rows, otherwise the largest index is out of range.
	model.add(Embedding(len(word_dict) + 1, max_cap, input_length=max_cap))
	model.add(LSTM(100, return_sequences=True))
	model.add(LSTM(100))
	model.add(Dense(100, activation='relu'))
	model.add(Dense(2, activation='softmax'))
	# summary() prints the table itself and returns None, so it is not wrapped
	# in print().
	model.summary()

	optimizer = Adam(lr=0.001, decay=0.0001)
	model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

ravishchawla / model_lstm_model_7.py

Created June 27, 2018 19:36

	# Smaller variant of the stacked LSTM classifier with recurrent dropout in
	# both LSTM layers and a Dropout layer between them.
	model = Sequential()
	# The word indices in word_dict come from the Keras Tokenizer and run from
	# 1 to len(word_dict); 0 is the padding value. The embedding table therefore
	# needs len(word_dict) + 1 rows, otherwise the largest index is out of range.
	model.add(Embedding(len(word_dict) + 1, max_cap, input_length=max_cap))
	model.add(LSTM(60, return_sequences=True, recurrent_dropout=0.5))
	model.add(Dropout(0.5))
	model.add(LSTM(60, recurrent_dropout=0.5))
	model.add(Dense(60, activation='relu'))
	model.add(Dense(2, activation='softmax'))
	# summary() prints the table itself and returns None, so it is not wrapped
	# in print().
	model.summary()

	optimizer = Adam(lr=0.01, decay=0.001)

ravishchawla / medium_lstm_glove_8.py

Last active June 27, 2018 19:58

	# Read the 100-dimensional GloVe vectors into a word -> vector lookup.
	embeddings_index = {}
	# Decode explicitly as UTF-8 instead of relying on the platform default
	# encoding, which cannot be assumed to handle the non-ASCII tokens.
	with open('data/glove.6B.100d.txt', encoding='utf-8') as f:
		for line in f:
			# Each line is "<word> <coef_1> ... <coef_n>", whitespace separated.
			values = line.split()
			word = values[0]
			coefs = np.asarray(values[1:], dtype='float32')
			embeddings_index[word] = coefs

	# One 100-wide row per vocabulary entry, zero-initialised.
	# NOTE(review): if rows are addressed by the 1-based Tokenizer index, the
	# matrix needs vocab_size + 1 rows - confirm against the code that fills it.
	vocab_size = len(sequence_dict)
	embeddings_matrix = np.zeros((vocab_size, 100))

ravishchawla / medium_lstm_test_9.py

Last active June 27, 2018 21:38

	# Obtain predictions as the index of the largest softmax output per sample.
	# This is what Sequential.predict_classes computed for a multi-unit output;
	# that helper no longer exists in current Keras releases.
	predictions = np.argmax(model.predict(X_test), axis=-1)

	# Convert Y_test to the same format as predictions:
	# one-hot [1, 0] -> class 0, anything else -> class 1.
	actuals = [0 if y[0] == 1 else 1 for y in Y_test]

	# Use SkLearn's Metrics module; its signature is (y_true, y_pred).
	accuracy_score(actuals, predictions)

ravishchawla / airbnb_post_1.py

Created February 25, 2019 20:07

AirBnB post: Imports and Data Loading

	import pandas as pd;
	import numpy as np;
	import matplotlib;
	import matplotlib.pyplot as plt;
	import seaborn as sns;
	from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, f1_score
	from IPython.display import display

ravishchawla / airbnb_post_2.py

Created February 25, 2019 20:10

AirBnB post: Data exploration

	def df_stats(df):
		"""Print the shape of ``df`` and the missing-value fraction of its columns.

		Only columns with at least one missing value are listed, as
		(column, fraction) pairs ordered from the largest fraction down.
		Returns None; all output goes to stdout.
		"""
		print('Shape: ' , df.shape)

		missings = df.isnull().mean()
		# Plain Python floats keep the printed pairs readable no matter how the
		# installed NumPy formats its scalar types.
		missing_vals = {col: float(frac) for col, frac in missings[missings > 0].items()}
		ranked = sorted(missing_vals.items(), key=lambda pair: pair[1], reverse=True)
		print('# Columns with any missing elements : ' , ranked)

		print()

	# NOTE(review): the original indentation was lost in this listing; the
	# function is assumed to end at the blank print() above - confirm.
	print('Listings: ')

ravishchawla / airbnb_post_3.py

Created February 25, 2019 22:21

AirBnB post: Cleaning the data

	# Fraction of missing values per column, computed before anything is dropped.
	miss_listings = listings.isnull().sum() / len(listings)

	# Drop the columns that are more than half empty.
	miss_gr_05 = listings.columns[miss_listings > 0.5]
	listings = listings.drop(miss_gr_05, axis=1)

	# Report the remaining columns that are still more than 30% empty.
	# miss_listings keeps an entry for every dropped column, so its mask no
	# longer has the same length as listings.columns; restrict it to the
	# surviving columns before using it as a mask.
	miss_remaining = miss_listings[listings.columns]
	miss_gr_03 = listings.columns[miss_remaining > 0.3]
	miss_vals_03 = miss_listings > 0.3
	print([col + ' ' + str(miss_listings[col]) for col in miss_gr_03])

	# Strip the percent sign and convert to float; a missing value becomes the
	# string 'nan', which float() turns into NaN. Assign to the single column
	# label: a one-element list key makes pandas expect one value per key.
	listings['host_response_rate'] = listings['host_response_rate'].apply(lambda col: float(str(col).replace("%", "")))

ravishchawla / airbnb_post_4.py

Created February 25, 2019 22:47

AirBnB post: Categorical data cleaning

	# Column names grouped by the kind of non-numerical data they hold.

	# Columns handled as categorical values.
	categorical_attributes = [
		'experiences_offered',
		'host_location',
		'host_response_time',
		'host_neighbourhood',
		'street',
		'neighbourhood',
		'neighbourhood_cleansed',
		'neighbourhood_group_cleansed',
		'city',
		'state',
		'market',
		'smart_location',
		'country_code',
		'country',
		'property_type',
		'room_type',
		'bed_type',
		'cancellation_policy',
	]

	# Free-text columns.
	full_text_attributes = [
		'name',
		'summary',
		'space',
		'description',
		'neighborhood_overview',
		'interaction',
		'house_rules',
		'host_about',
	]

	# Columns listed for dropping.
	dropping_attributes = [
		'listing_url',
		'picture_url',
		'host_url',
		'host_thumbnail_url',
		'host_picture_url',
		'last_scraped',
		'host_name',
		'calendar_last_scraped',
		'calendar_updated',
	]

	# Date columns.
	date_attributes = [
		'host_since',
		'first_review',
		'last_review',
	]
	bool_attributes = ['host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'is_location_exact', 'has_availability', 'requires_license', 'instant_bookable', 'is_business_travel_ready', 'require_guest_profile_picture', 'require_guest_phone