Skip to content

Instantly share code, notes, and snippets.

View jonathanoheix's full-sized avatar

jonathanoheix

  • Macif-Mutualité
  • France
View GitHub Profile
# display some images for every different expression
import numpy as np
import seaborn as sns
from keras.preprocessing.image import load_img, img_to_array
import matplotlib.pyplot as plt
import os
# Side length, in pixels, of the square images (48 x 48).
pic_size = 48
# PR curve
# `signature` comes from the standard library: the former
# `sklearn.utils.fixes.signature` shim is not available in current
# scikit-learn releases and importing it raises ImportError.
from inspect import signature
from sklearn.metrics import average_precision_score, precision_recall_curve
# y_test / y_pred are defined elsewhere in the notebook (true labels and
# predicted scores respectively).
# Single-number summary of the precision-recall curve.
average_precision = average_precision_score(y_test, y_pred)
# Curve points; the third return value (the thresholds) is not used here.
precision, recall, _ = precision_recall_curve(y_test, y_pred)
# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
# ROC curve
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
# Keep only the second entry (index 1) of each predict_proba row as the score.
# NOTE(review): presumably this is the probability of class 1 (bad review);
# confirm against rf.classes_.
y_pred = [x[1] for x in rf.predict_proba(X_test)]
# Label 1 is treated as the positive class.
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label = 1)
# Area under the curve, computed from the (fpr, tpr) points above.
roc_auc = auc(fpr, tpr)
# train a random forest classifier
# 100 trees; random_state is fixed at 42.
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)
# show feature importance
# Pair every feature name with the fitted model's importance value and sort
# so the largest importance comes first.
# NOTE(review): `pd` (pandas) is not imported in the visible part of this file.
feature_importances_df = pd.DataFrame({"feature": features, "importance": rf.feature_importances_}).sort_values("importance", ascending = False)
# First 20 rows, i.e. the 20 largest importances.
feature_importances_df.head(20)
# feature selection
# Name of the target column.
label = "is_bad_review"
# Columns that must not be used as model inputs: the target itself and the
# "review" / "review_clean" columns.
ignore_cols = [label, "review", "review_clean"]
# Every remaining column of reviews_df is used as a feature.
features = [c for c in reviews_df.columns if c not in ignore_cols]
# split the data into train and test
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing (test_size = 0.20); random_state is
# fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(reviews_df[features], reviews_df[label], test_size = 0.20, random_state = 42)
# plot sentiment distribution for positive and negative reviews
import seaborn as sns
for x in [0, 1]:
subset = reviews_df[reviews_df['is_bad_review'] == x]
# Draw the density plot
if x == 0:
label = "Good reviews"
# The 10 reviews with the highest "neg" score, restricted to reviews with at
# least 5 words (nb_words >= 5); only the "review" and "neg" columns are kept.
reviews_df[reviews_df["nb_words"] >= 5].sort_values("neg", ascending = False)[["review", "neg"]].head(10)
# The 10 reviews with the highest "pos" score, restricted to reviews with at
# least 5 words (nb_words >= 5); only the "review" and "pos" columns are kept.
reviews_df[reviews_df["nb_words"] >= 5].sort_values("pos", ascending = False)[["review", "pos"]].head(10)
# wordcloud function
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def show_wordcloud(data, title = None):
wordcloud = WordCloud(
background_color = 'white',
max_words = 200,
max_font_size = 40,
# show is_bad_review distribution
# normalize = True gives the share of rows per value instead of raw counts.
reviews_df["is_bad_review"].value_counts(normalize = True)