Last active
May 30, 2020 20:12
-
-
Save ecdedios/de48523d5b65b5a591b08d3d7edc3ba8 to your computer and use it in GitHub Desktop.
Basic cleaning and n-gram helper functions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def clean(text):
    """
    Normalize raw text and return its lemmatized, non-stop-word tokens.

    The text is NFKD-normalized, reduced to ASCII (characters that cannot
    be encoded as ASCII are dropped), lower-cased, stripped of every
    character that is neither a word character nor whitespace, and then
    split on whitespace. Tokens found in NLTK's English stop-word list or
    in ADDITIONAL_STOPWORDS are discarded; each remaining token is
    lemmatized with WordNetLemmatizer.

    Returns a list of lemmatized words in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # Build a set once: the stop words are probed for every token below,
    # and set membership is constant-time where a list scan is linear.
    stopwords = set(nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Drop punctuation/symbols before tokenizing on whitespace.
    words = re.sub(r'[^\w\s]', '', text).split()
    # Stop words are filtered on the raw token, before lemmatization.
    return [wnl.lemmatize(word) for word in words if word not in stopwords]
def get_words(df, column):
    """
    Return the cleaned words from every value in one dataframe column.

    Each value in ``df[column]`` is converted with ``str``, the pieces are
    joined with single spaces, and the combined text is passed to
    ``clean``, whose result (a list of words) is returned.
    """
    # Join the values themselves rather than taking the repr of the whole
    # list: a list repr spells control characters inside strings as escape
    # sequences (a newline becomes backslash + "n"), and once punctuation
    # is stripped the leftover letter fuses the neighbouring words.
    return clean(' '.join(str(value) for value in df[column].tolist()))
def get_bigrams(df, column, top_n=10):
    """
    Count the bigrams in a dataframe column and return the most frequent.

    The words come from ``get_words(df, column)``; 2-grams are built from
    them with ``nltk.ngrams`` and tallied with ``value_counts``.

    ``top_n`` is the number of leading rows of the tally to keep
    (default 10).

    Returns a pandas Series of counts indexed by bigram.
    """
    words = get_words(df, column)
    bigram_counts = pd.Series(nltk.ngrams(words, 2)).value_counts()
    # Positional slice, so the cut-off never depends on the index labels.
    return bigram_counts.iloc[:top_n]
def get_trigrams(df, column, top_n=10):
    """
    Count the trigrams in a dataframe column and return the most frequent.

    The words come from ``get_words(df, column)``; 3-grams are built from
    them with ``nltk.ngrams`` and tallied with ``value_counts``.

    ``top_n`` is the number of leading rows of the tally to keep
    (default 10).

    Returns a pandas Series of counts indexed by trigram.
    """
    words = get_words(df, column)
    trigram_counts = pd.Series(nltk.ngrams(words, 3)).value_counts()
    # Positional slice, so the cut-off never depends on the index labels.
    return trigram_counts.iloc[:top_n]
def viz_bigrams(df, column):
    """
    Draw a horizontal bar chart of the most frequent bigrams in a column.

    The counts come from ``get_bigrams(df, column)`` and are sorted in
    ascending order before plotting. The chart is titled and its axes are
    labelled through ``plt``; nothing is returned.
    """
    top_bigrams = get_bigrams(df, column)
    top_bigrams.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
    # Take the number in the title from the data so it always matches the
    # number of bars actually drawn.
    plt.title('{} Most Frequently Occurring Bigrams'.format(len(top_bigrams)))
    plt.ylabel('Bigram')
    plt.xlabel('# Occurrences')
def viz_trigrams(df, column):
    """
    Draw a horizontal bar chart of the most frequent trigrams in a column.

    The counts come from ``get_trigrams(df, column)`` and are sorted in
    ascending order before plotting. The chart is titled and its axes are
    labelled through ``plt``; nothing is returned.
    """
    top_trigrams = get_trigrams(df, column)
    top_trigrams.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
    # Take the number in the title from the data so it always matches the
    # number of bars actually drawn.
    plt.title('{} Most Frequently Occurring Trigrams'.format(len(top_trigrams)))
    plt.ylabel('Trigram')
    plt.xlabel('# Occurrences')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment