This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# display some images for every different expression | |
import numpy as np | |
import seaborn as sns | |
from keras.preprocessing.image import load_img, img_to_array | |
import matplotlib.pyplot as plt | |
import os | |
# side length, in pixels, of the square images (48x48)
pic_size = 48
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PR curve
# NOTE: `signature` used to be imported from `sklearn.utils.fixes`, which was
# only a compatibility alias and is not available in current scikit-learn
# releases; the standard-library function is the drop-in replacement.
from inspect import signature

from sklearn.metrics import average_precision_score, precision_recall_curve

# y_test / y_pred are defined outside this snippet (presumably the true labels
# and the positive-class scores of the classifier -- confirm against the
# cell that builds them).
average_precision = average_precision_score(y_test, y_pred)
# the third return value (the decision thresholds) is not needed here
precision, recall, _ = precision_recall_curve(y_test, y_pred)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ROC curve
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

# Keep only the second predict_proba column (index 1) for every test sample.
# Slicing the returned array replaces the former row-by-row Python loop;
# y_pred is now a 1-D NumPy array instead of a list, which the scikit-learn
# metrics below accept just the same.
y_pred = rf.predict_proba(X_test)[:, 1]

# false/true positive rates at each threshold, treating label 1 as positive
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
# area under the ROC curve
roc_auc = auc(fpr, tpr)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# train a random forest classifier (100 trees, fixed seed for reproducibility)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# show feature importance: pair each feature name with the importance the
# fitted forest reports for it, largest first
feature_importances_df = pd.DataFrame(
    {"feature": features, "importance": rf.feature_importances_}
).sort_values("importance", ascending=False)
# display the 20 most important features
feature_importances_df.head(20)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# feature selection: every column except the target and the two text columns
label = "is_bad_review"
ignore_cols = [label, "review", "review_clean"]
features = [c for c in reviews_df.columns if c not in ignore_cols]

# split the data into train and test
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    reviews_df[features],
    reviews_df[label],
    test_size=0.20,
    random_state=42,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# plot sentiment distribution for positive and negative reviews | |
import seaborn as sns | |
for x in [0, 1]: | |
subset = reviews_df[reviews_df['is_bad_review'] == x] | |
# Draw the density plot | |
if x == 0: | |
label = "Good reviews" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# most negative reviews: the 10 highest "neg" scores among reviews
# with at least 5 words
reviews_df[reviews_df["nb_words"] >= 5].sort_values("neg", ascending=False)[["review", "neg"]].head(10)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# most positive reviews: the 10 highest "pos" scores among reviews
# with at least 5 words
reviews_df[reviews_df["nb_words"] >= 5].sort_values("pos", ascending=False)[["review", "pos"]].head(10)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# wordcloud function | |
from wordcloud import WordCloud | |
import matplotlib.pyplot as plt | |
def show_wordcloud(data, title = None): | |
wordcloud = WordCloud( | |
background_color = 'white', | |
max_words = 200, | |
max_font_size = 40, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# show is_bad_review distribution as relative frequencies (fractions summing to 1)
reviews_df["is_bad_review"].value_counts(normalize=True)