Skip to content

Instantly share code, notes, and snippets.

@stanlee321
Last active March 27, 2020 19:03
Show Gist options
  • Save stanlee321/81a8fa13964d9d823b148997d2ac43d7 to your computer and use it in GitHub Desktop.
Save stanlee321/81a8fa13964d9d823b148997d2ac43d7 to your computer and use it in GitHub Desktop.
Plot the top N most frequent words (values) from a DataFrame column
import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Helper class for plotting value frequencies
class PlotFrecuency:
    """Plot the top-N most frequent values of a DataFrame column.

    The usual entry point is :meth:`main`, which counts the values of one
    column, prints the ranking, draws a seaborn bar plot and saves it as
    ``<path_to_save>/<title>.jpg``. The individual steps are also exposed as
    methods and can be called on their own.
    """

    def __init__(self, path_to_save):
        """Store the output directory and set default fonts and labels.

        Args:
            path_to_save: Directory in which the rendered ``.jpg`` is saved.
        """
        self.path_to_save = path_to_save
        self.fontTitle = {
            'weight': 'bold',
            'size': 20,
        }
        self.fontY = {
            'weight': 'bold',
            'size': 15,
        }
        # Historic misspelled attribute name, kept as an alias so existing
        # code that reads or tweaks ``fonntY`` keeps working.
        self.fonntY = self.fontY
        self.fontX = {
            'weight': 'bold',
            'size': 15,
        }
        self.top20 = None
        # Defaults mirror those of main() so the helper methods can be used
        # without calling main() first.
        self.target_column = "Departamento"
        self.key_name = "Departamento"
        self.key_value = "Frecuencia"
        self.title_x = "Departamento"
        self.title_y = "Frecuencia"
        self.title = "Acciones por departamento"

    def get_sorted_tuits(self, counts, top_n=25):
        """Return the ``top_n`` ``(value, count)`` pairs, highest count first.

        Ties on count are broken by the value itself, in descending order.
        Each ranked pair is also printed, labelled with ``self.key_name``
        and ``self.key_value``.

        Args:
            counts: Mapping of value -> number of occurrences.
            top_n: Maximum number of pairs to return.

        Returns:
            A list of ``(value, count)`` tuples.
        """
        sorted_twitts = sorted(
            counts.items(), key=lambda kv: (kv[1], kv[0]), reverse=True
        )[:top_n]
        for i, (k, v) in enumerate(sorted_twitts):
            print(str(i) + "-", f"{self.key_name}: ", k, " |||", f"{self.key_value}: ", v)
        return sorted_twitts

    def create_df_counts(self, sorted_twitts):
        """Build a DataFrame with ``label``, ``count`` and ``index`` columns.

        The frame is indexed by label and sorted by count, descending.

        Args:
            sorted_twitts: Sequence of ``(label, count)`` pairs.

        Returns:
            A ``pandas.DataFrame`` with one row per pair.
        """
        df_x = pd.DataFrame(sorted_twitts, columns=["label", "count"])
        # Keep the incoming rank as a column; it becomes the bar position.
        df_x["index"] = df_x.index
        df_x.index = list(df_x["label"])
        # A stable sort keeps the incoming order of rows with equal counts.
        df_x = df_x.sort_values(['count'], ascending=False, kind="mergesort")
        return df_x

    def get_uniques_and_user_names(self, df):
        """Return the unique values and the full ``self.target_column`` column.

        The unique values are also stored on ``self.uniques``.

        Args:
            df: DataFrame containing the column named ``self.target_column``.

        Returns:
            A ``(uniques, column)`` tuple.
        """
        user_names = df[self.target_column]
        self.uniques = user_names.unique()
        return self.uniques, user_names

    def plot_word_count(self, df, titley='Number of Comments', titlex="Names", title="Message frec"):
        """Draw the bar chart for ``df`` and save it as a JPEG.

        The image is written to ``<self.path_to_save>/<title>.jpg``.

        Args:
            df: Frame as produced by :meth:`create_df_counts` (columns
                ``label``, ``count`` and ``index``).
            titley: Y-axis label.
            titlex: X-axis label.
            title: Plot title; also used as the output file name.
        """
        plt.figure(figsize=(10, 6))
        ax = sns.barplot(x="index", y="count", data=df, order=df['index'])
        ax.set_title(title, fontdict=self.fontTitle)
        ax.set_xlabel(titlex, fontdict=self.fontX)
        ax.set_ylabel(titley, fontdict=self.fontY)
        # y axis values font size
        ax.tick_params(axis='y', labelsize=20)
        # Grid
        ax.grid(linestyle='--', linewidth=1)
        # Write each bar's count just above the bar.
        for rect, label in zip(ax.patches, df["count"]):
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2, height, label,
                    ha='center', va='bottom', size=15)
        ax.set_xticklabels(df['label'], rotation='vertical', fontsize=20)
        # Name the file after the title actually drawn on this plot.
        out_path = os.path.join(self.path_to_save, title + ".jpg")
        plt.savefig(out_path, format="jpg", bbox_inches='tight')

    def create_sorted_counts(self, user_names, top_n=25):
        """Count ``user_names`` and return the ``top_n`` ranked pairs.

        Args:
            user_names: Iterable of hashable values.
            top_n: Maximum number of pairs to return.

        Returns:
            A list of ``(value, count)`` tuples, highest count first.
        """
        counts = self.get_counts(user_names)
        return self.get_sorted_tuits(counts, top_n=top_n)

    def get_counts(self, user_names):
        """Return a dict mapping each value to its number of occurrences.

        Args:
            user_names: Iterable of hashable values.

        Returns:
            A plain ``dict`` of value -> count, in first-seen order.
        """
        return dict(Counter(user_names))

    def create_top_N_users(self, sorted_tuits, n=21):
        """Return the first ``n`` rows of the counts DataFrame.

        Args:
            sorted_tuits: Sequence of ``(label, count)`` pairs.
            n: Number of rows to keep.

        Returns:
            A ``pandas.DataFrame`` with at most ``n`` rows.
        """
        df_x = self.create_df_counts(sorted_tuits)
        return df_x.iloc[0:n]

    def main(self, df, top_n=21, target_column="Departamento",
             name="Departamento",
             value="Frecuencia",
             title_x="Departamento",
             title_y="Frecuencia",
             title="Acciones por departamento"):
        """Count a column, print the ranking, then plot and save the top N.

        Intermediate results are kept on the instance as ``uniques``,
        ``user_names``, ``sorted_counts`` and ``top20``.

        Args:
            df: DataFrame to analyse.
            top_n: Number of most frequent values to plot.
            target_column: Name of the column whose values are counted.
            name: Label printed before each value in the ranking.
            value: Label printed before each count in the ranking.
            title_x: X-axis label.
            title_y: Y-axis label.
            title: Plot title; also used as the output file name.
        """
        # Set some parameters for the plot
        self.target_column = target_column
        self.key_name = name
        self.key_value = value
        self.title_x = title_x
        self.title_y = title_y
        self.title = title
        self.uniques, self.user_names = self.get_uniques_and_user_names(df)
        self.sorted_counts = self.create_sorted_counts(self.user_names, top_n=top_n)
        self.top20 = self.create_top_N_users(self.sorted_counts, n=top_n)
        self.plot_word_count(self.top20,
                             titley=self.title_y,
                             titlex=self.title_x,
                             title=self.title)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment