Arun Mohan arunm8489

Data Scientist | ML engineer

arunm8489 / rnn19

Last active August 1, 2020 14:11

arunm8489 / rnn18

Created August 1, 2020 13:55

	# Numeric features: place the raw columns side by side as an (n_rows, 4)
	# array, in the same column order for train and test.
	numeric_cols = ['price', 'teacher_number_of_previously_posted_projects', 'quantity', 'presence_of_num']
	train_numeric = np.concatenate([X_train[c].values.reshape(-1, 1) for c in numeric_cols], axis=1)
	test_numeric = np.concatenate([X_test[c].values.reshape(-1, 1) for c in numeric_cols], axis=1)

	# The scaler is fitted on the training rows only; the test rows are
	# transformed with the statistics learned from train.
	stndardscalar = StandardScaler()
	std_train_numeric = stndardscalar.fit_transform(train_numeric)
	std_test_numeric = stndardscalar.transform(test_numeric)

arunm8489 / rnn17

Created August 1, 2020 13:53

	# label encoding categorical features
	def label_encoding(col):
	    """Label-encode column `col` of the global X_train / X_test frames.

	    The encoder is fitted on the training column only. Test values that
	    were not seen during fitting are replaced by the string 'unknown'
	    (this overwrites X_test[col] in place), and 'unknown' is registered
	    as an extra class so those rows receive their own integer code.

	    Returns a tuple (train_cols, test_cols) of encoded arrays.
	    """
	    label_encoder = LabelEncoder()
	    train_cols = label_encoder.fit_transform(X_train[col])

	    # Set lookup instead of scanning the classes_ array for every row.
	    known_classes = set(label_encoder.classes_)
	    X_test[col] = X_test[col].apply(lambda x: x if x in known_classes else 'unknown')

	    # Register the placeholder once so transform() can map it.
	    if 'unknown' not in known_classes:
	        label_encoder.classes_ = np.append(label_encoder.classes_, 'unknown')

	    # transform(), not fit_transform(): refitting on the test column would
	    # discard the train mapping and give train and test different codes
	    # for the same category.
	    test_cols = label_encoder.transform(X_test[col])
	    return train_cols, test_cols

arunm8489 / rnn16

Created August 1, 2020 13:52

	# Load the prepared dataset and separate the target from the features.
	dff = pd.read_csv('final_df.csv')
	X = dff.drop(columns=['project_is_approved'])
	y = dff['project_is_approved']

	# 80/20 shuffled split with a fixed seed.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.2, shuffle=True, random_state=100
	)

	# Report the shape of each piece, train first, then test.
	for split_part in (X_train, y_train, X_test, y_test):
	    print(split_part.shape)

arunm8489 / rnn15

Created July 31, 2020 14:32

	# Columns kept for modelling; the target 'project_is_approved' comes last.
	final_columns = [
	    'teacher_prefix',
	    'school_state',
	    'project_grade_category',
	    'project_subject_categories',
	    'project_subject_subcategories',
	    'essay',
	    'quantity',
	    'price',
	    'presence_of_num',
	    'teacher_number_of_previously_posted_projects',
	    'project_is_approved',
	]
	df = data[final_columns]

	# Write the selection to disk without the index column.
	df.to_csv('final_df.csv', index=False)

arunm8489 / rnn14

Created July 31, 2020 14:30

	# Merge the four essay columns and the project title into one text column.
	# Missing values are replaced by "" before the str conversion (str() of a
	# missing value would inject the literal token "nan" into the text), and
	# the parts are joined with a space so the last word of one part is not
	# glued to the first word of the next.
	essay_parts = ["project_essay_1", "project_essay_2", "project_essay_3", "project_essay_4", "project_title"]
	merged_essay = data[essay_parts[0]].fillna("").map(str)
	for part in essay_parts[1:]:
	    merged_essay = merged_essay + " " + data[part].fillna("").map(str)
	data["essay"] = merged_essay

	# Replace the raw merged text with its preprocessed form.
	processed_essays = preprocess_text(data['essay'].values)
	data['essay'] = processed_essays

arunm8489 / rnn13

Created July 31, 2020 14:28

	# Show a few project titles, each labelled with its row position.
	# (The values printed are titles, so the banner says so.)
	print("printing some random titles")
	title_values = data['project_title'].values
	for idx in (91, 3, 147):
	    print(idx, title_values[idx])

arunm8489 / rnn12

Created July 31, 2020 14:27

	# Preprocess every project title and store the result back in the frame.
	processed_titles = preprocess_text(data['project_title'].values)
	data['project_title'] = processed_titles

	# Print a few processed titles; the label is the same index that is
	# looked up (the first label previously read 9 while showing item 91).
	for idx in (91, 3, 147):
	    print(idx, processed_titles[idx])

arunm8489 / rnn12

Last active July 31, 2020 14:24

	import re

	def decontracted(phrase):
	# specific
	phrase = re.sub(r"won't", "will not", phrase)
	phrase = re.sub(r"can\'t", "can not", phrase)

	# general
	phrase = re.sub(r"n\'t", " not", phrase)
	phrase = re.sub(r"\'re", " are", phrase)

arunm8489 / rnn11

Created July 31, 2020 14:08

	def presence_number(data):
	    """Return 1 if any character of `data` is a digit, otherwise 0.

	    `data` is iterated character by character, so an empty string
	    yields 0.
	    """
	    # any() already yields a bool; int() maps True/False to 1/0.
	    return int(any(char.isdigit() for char in data))


	# Flag (1/0) whether each resource summary contains at least one digit.
	data['presence_of_num'] = data['project_resource_summary'].map(presence_number)