Last active
March 27, 2020 19:03
-
-
Save stanlee321/81a8fa13964d9d823b148997d2ac43d7 to your computer and use it in GitHub Desktop.
Plot the top N most frequent values of a DataFrame column
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Auxiliary class for plotting value frequencies
class PlotFrecuency:
    """Plot the top N most frequent values of a DataFrame column.

    Typical use is ``PlotFrecuency(out_dir).main(df, target_column=...)``,
    which counts the values of the column, prints the ranking, and saves a
    bar chart as ``<out_dir>/<title>.jpg``.  The individual steps are also
    usable on their own.
    """

    def __init__(self, path_to_save):
        """Store the output directory and set up default fonts and labels.

        Args:
            path_to_save: Directory in which the generated chart is saved.
                The directory is not created here.
        """
        self.path_to_save = path_to_save
        self.fontTitle = {
            'weight': 'bold',
            'size': 20,
        }
        self.fontY = {
            'weight': 'bold',
            'size': 15,
        }
        # Backward-compatible alias for the historical misspelled attribute.
        self.fonntY = self.fontY
        self.fontX = {
            'weight': 'bold',
            'size': 15,
        }
        # Defaults mirror those of main() so that the helper methods can be
        # called directly without running main() first.
        self.target_column = "Departamento"
        self.key_name = "Departamento"
        self.key_value = "Frecuencia"
        self.title_x = "Departamento"
        self.title_y = "Frecuencia"
        self.title = "Acciones por departamento"
        self.uniques = None
        self.user_names = None
        self.sorted_counts = None
        self.top20 = None

    def get_sorted_tuits(self, counts, top_n=25):
        """Rank ``counts`` and print the ranking.

        Args:
            counts: Mapping of label -> number of occurrences.
            top_n: Maximum number of entries to keep.

        Returns:
            A list of ``(label, count)`` tuples, highest count first; ties
            are ordered by label, descending.
        """
        ranked = sorted(
            counts.items(),
            key=lambda kv: (kv[1], kv[0]),
            reverse=True,
        )[:top_n]
        for i, (k, v) in enumerate(ranked):
            print(str(i) + "-", f"{self.key_name}: ", k, " |||", f"{self.key_value}: ", v)
        return ranked

    def create_df_counts(self, sorted_twitts):
        """Turn ``(label, count)`` pairs into a DataFrame ready for plotting.

        Args:
            sorted_twitts: Iterable of ``(label, count)`` tuples.

        Returns:
            A DataFrame indexed by label with columns ``label``, ``count``
            and ``index`` (the position of the row in the input), sorted by
            ``count`` in descending order.
        """
        df_x = pd.DataFrame(sorted_twitts, columns=["label", "count"])
        df_x["index"] = df_x.index
        df_x.index = list(df_x["label"])
        # A stable sort keeps the incoming order for equal counts, making
        # the resulting bar order deterministic.
        return df_x.sort_values(['count'], ascending=False, kind="mergesort")

    def get_uniques_and_user_names(self, df):
        """Extract the target column of ``df`` and its distinct values.

        Args:
            df: DataFrame containing the column named ``self.target_column``.

        Returns:
            A tuple ``(uniques, values)`` where ``values`` is the column and
            ``uniques`` the result of calling ``.unique()`` on it.
        """
        user_names = df[self.target_column]
        self.uniques = user_names.unique()
        return self.uniques, user_names

    def plot_word_count(self, df, titley='Number of Comments', titlex="Names", title="Message frec"):
        """Draw a bar chart of the counts and save it as a JPG file.

        Args:
            df: DataFrame with ``label``, ``count`` and ``index`` columns, as
                produced by ``create_df_counts``.
            titley: Label of the y axis.
            titlex: Label of the x axis.
            title: Chart title; also used as the output file name (without
                extension).
        """
        plt.figure(figsize=(10, 6))
        ax = sns.barplot(x="index", y="count", data=df, order=df['index'])
        ax.set_title(title, fontdict=self.fontTitle)
        ax.set_xlabel(titlex, fontdict=self.fontX)
        ax.set_ylabel(titley, fontdict=self.fontY)
        # Font size of the y axis tick values.
        ax.tick_params(axis='y', labelsize=20)
        ax.grid(linestyle='--', linewidth=1)
        # Write each bar's count just above the bar.
        for rect, label in zip(ax.patches, df["count"]):
            ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height(), label,
                    ha='center', va='bottom', size=15)
        ax.set_xticklabels(df['label'], rotation='vertical', fontsize=20)
        # Name the file after the title actually drawn on the chart rather
        # than whatever the last main() call stored on the instance.
        output_path = os.path.join(self.path_to_save, title + ".jpg")
        plt.savefig(output_path, format="jpg", bbox_inches='tight')

    def create_sorted_counts(self, user_names, top_n=25):
        """Count the values in ``user_names`` and return the ``top_n`` ranking.

        Args:
            user_names: Iterable of hashable values.
            top_n: Maximum number of entries to keep.

        Returns:
            A list of ``(label, count)`` tuples, highest count first.
        """
        counts = self.get_counts(user_names)
        return self.get_sorted_tuits(counts, top_n=top_n)

    def get_counts(self, user_names):
        """Count how many times each value occurs in ``user_names``.

        Args:
            user_names: Iterable of hashable values.

        Returns:
            A dict mapping each distinct value to its number of occurrences,
            in order of first appearance.
        """
        return dict(Counter(user_names))

    def create_top_N_users(self, sorted_tuits, n=21):
        """Build the counts DataFrame and keep its first ``n`` rows.

        Args:
            sorted_tuits: Iterable of ``(label, count)`` tuples.
            n: Number of rows to keep.

        Returns:
            The first ``n`` rows of the DataFrame built by
            ``create_df_counts``.
        """
        df_x = self.create_df_counts(sorted_tuits)
        return df_x.iloc[0:n]

    def main(self, df, top_n=21, target_column="Departamento",
             name="Departamento",
             value="Frecuencia",
             title_x="Departamento",
             title_y="Frecuencia",
             title="Acciones por departamento"):
        """Count, rank, print and plot the values of one column of ``df``.

        Args:
            df: Input DataFrame.
            top_n: Number of most frequent values to report and plot.
            target_column: Name of the column whose values are counted.
            name: Caption printed before each label in the ranking.
            value: Caption printed before each count in the ranking.
            title_x: Label of the x axis.
            title_y: Label of the y axis.
            title: Chart title and output file name (without extension).
        """
        # Remember the parameters of this run on the instance.
        self.target_column = target_column
        self.key_name = name
        self.key_value = value
        self.title_x = title_x
        self.title_y = title_y
        self.title = title
        self.uniques, self.user_names = self.get_uniques_and_user_names(df)
        self.sorted_counts = self.create_sorted_counts(self.user_names, top_n=top_n)
        self.top20 = self.create_top_N_users(self.sorted_counts, n=top_n)
        self.plot_word_count(self.top20,
                             titley=self.title_y,
                             titlex=self.title_x,
                             title=self.title)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment