jonathanoheix

32 followers · 12 following

Macif-Mutualité
France

View GitHub Profile

Recently created

Least recently created

Recently updated

Least recently updated

jonathanoheix / face1.py

Created January 4, 2019 08:25

	# display some images for every different expression

	import numpy as np
	import seaborn as sns
	from keras.preprocessing.image import load_img, img_to_array
	import matplotlib.pyplot as plt
	import os

	# size of the image: 48*48 pixels
	pic_size = 48

jonathanoheix / nlp17.py

Created December 18, 2018 09:53

	# PR curve

	# Use the standard-library inspect.signature: sklearn.utils.fixes.signature
	# was only a private compatibility alias and is no longer available in
	# recent scikit-learn releases, so importing it raises ImportError there.
	from inspect import signature

	from sklearn.metrics import average_precision_score, precision_recall_curve

	# single-number summary of the precision-recall curve
	average_precision = average_precision_score(y_test, y_pred)

	# precision/recall values along the curve; the returned thresholds are not used
	precision, recall, _ = precision_recall_curve(y_test, y_pred)

	# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument

jonathanoheix / nlp16.py

Created December 18, 2018 09:53

	# ROC curve

	from sklearn.metrics import roc_curve, auc, roc_auc_score
	import matplotlib.pyplot as plt

	# keep only the second entry (index 1) of every predict_proba row as the score
	class_probabilities = rf.predict_proba(X_test)
	y_pred = [row[1] for row in class_probabilities]

	# false/true positive rates at each threshold, with label 1 as the positive class
	fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)

	# area under the ROC curve
	roc_auc = auc(fpr, tpr)

jonathanoheix / nlp15.py

Created December 18, 2018 09:53

	# fit a 100-tree random forest with a fixed seed
	rf = RandomForestClassifier(n_estimators=100, random_state=42)
	rf.fit(X_train, y_train)

	# pair every feature name with its importance, most important first
	importance_table = pd.DataFrame(
		{"feature": features, "importance": rf.feature_importances_}
	)
	feature_importances_df = importance_table.sort_values("importance", ascending=False)

	# display the 20 most important features
	feature_importances_df.head(20)

jonathanoheix / nlp14.py

Created December 18, 2018 09:53

	# feature selection: every column except the target and the two text columns
	label = "is_bad_review"
	ignore_cols = [label, "review", "review_clean"]
	features = [col for col in reviews_df.columns if col not in ignore_cols]

	# hold out 20% of the rows for testing, with a fixed seed
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import train_test_split

	feature_frame = reviews_df[features]
	target_series = reviews_df[label]
	X_train, X_test, y_train, y_test = train_test_split(
		feature_frame, target_series, test_size=0.20, random_state=42
	)

jonathanoheix / nlp13.py

Created December 18, 2018 09:52

	# plot sentiment distribution for positive and negative reviews

	import seaborn as sns

	for x in [0, 1]:
	subset = reviews_df[reviews_df['is_bad_review'] == x]

	# Draw the density plot
	if x == 0:
	label = "Good reviews"

jonathanoheix / nlp12.py

Created December 18, 2018 09:52

	# reviews with the highest "neg" sentiment score, i.e. the most negative ones
	# (restricted to rows where nb_words >= 5: at least 5 words, not strictly more);
	# sorted by "neg" in descending order, showing the first 10 rows
	reviews_df[reviews_df["nb_words"] >= 5].sort_values("neg", ascending = False)[["review", "neg"]].head(10)

jonathanoheix / nlp11.py

Created December 18, 2018 09:51

	# reviews with the highest "pos" sentiment score
	# (restricted to rows where nb_words >= 5: at least 5 words, not strictly more);
	# sorted by "pos" in descending order, showing the first 10 rows
	reviews_df[reviews_df["nb_words"] >= 5].sort_values("pos", ascending = False)[["review", "pos"]].head(10)

jonathanoheix / nlp10.py

Created December 18, 2018 09:51

	# wordcloud function

	from wordcloud import WordCloud
	import matplotlib.pyplot as plt

	def show_wordcloud(data, title = None):
	wordcloud = WordCloud(
	background_color = 'white',
	max_words = 200,
	max_font_size = 40,

jonathanoheix / nlp9.py

Created December 18, 2018 09:51

	# show is_bad_review distribution
	# normalize = True returns the share of rows per value instead of raw counts
	reviews_df["is_bad_review"].value_counts(normalize = True)