word_vectors_cooccurrence
import wikipedia
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# Note: word_tokenize requires NLTK's 'punkt' tokenizer data,
# available via nltk.download('punkt')
def retrieve_page(page_name: str) -> list:
    '''
    Retrieves the content of a Wikipedia page,
    strips punctuation, lowercases the text and
    returns it as a list of tokens.
    '''
    article = wikipedia.page(page_name)
    # Strip punctuation from the page content
    article = (
        article.content.translate(str.maketrans('', '', string.punctuation))
    )
    # Lowercase the text
    article = article.lower()
    # Tokenize using the NLTK word tokenizer
    article_tokens = word_tokenize(article)
    return article_tokens
def build_vocabulary(page: list) -> dict:
    '''
    Builds a vocabulary from every unique word in
    the tokenized page, mapping each word to an
    integer index.
    '''
    vocab = sorted(set(page))
    return {word: index for index, word in enumerate(vocab)}
def build_context(
        page: list,
        co_occurrence_vectors: pd.DataFrame
) -> pd.DataFrame:
    '''
    Updates the co-occurrence vectors based on the
    tokenized text of the page, using a symmetric
    context window of two words on each side.
    '''
    for index, element in enumerate(page):
        # Clamp the context window to the page boundaries
        start = max(0, index - 2)
        finish = min(len(page), index + 3)
        # Retrieve the context for the word, excluding the word itself
        context = page[start:index] + page[index + 1:finish]
        for word in context:
            # Update the co-occurrence matrix
            co_occurrence_vectors.loc[element, word] += 1
    return co_occurrence_vectors
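
# A quick sanity check of the window logic on a toy token list
# (an addition, not part of the original gist): with a two-word
# symmetric window, 'c' should co-occur exactly once with each
# of 'a', 'b', 'd' and 'e'.
toy_tokens = ['a', 'b', 'c', 'd', 'e']
toy_vocab = build_vocabulary(toy_tokens)
toy_matrix = pd.DataFrame(
    np.zeros([len(toy_vocab), len(toy_vocab)]),
    index=toy_vocab.keys(),
    columns=toy_vocab.keys()
)
print(build_context(toy_tokens, toy_matrix).loc['c'])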
# Retrieve and tokenize the page, then build the vocabulary
usa_article_token = retrieve_page('United States of America')
vocab_dict = build_vocabulary(usa_article_token)

# Initialize the co-occurrence matrix with zeros,
# one row and one column per vocabulary word
co_occurrence_vectors = pd.DataFrame(
    np.zeros([len(vocab_dict), len(vocab_dict)]),
    index=vocab_dict.keys(),
    columns=vocab_dict.keys()
)

# Fill the co-occurrence matrix by scanning the page
co_occurrence_vectors = build_context(
    usa_article_token,
    co_occurrence_vectors
)

# Compute pairwise cosine similarity between word vectors
similarity_words = pd.DataFrame(
    cosine_similarity(co_occurrence_vectors),
    columns=vocab_dict.keys(),
    index=vocab_dict.keys()
)

# Example: top 10 words by similarity to 'china'
similarity_words.loc['china'].sort_values(ascending=False).head(10)
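
# Hypothetical convenience wrapper (an addition, not from the original
# gist): look up the top-n most similar words for any token, dropping
# the word itself and guarding against out-of-vocabulary queries.
def most_similar(word: str, n: int = 10) -> pd.Series:
    if word not in similarity_words.index:
        raise KeyError(f"'{word}' is not in the vocabulary")
    # Drop the word itself, which always has similarity 1.0
    return (
        similarity_words.loc[word]
        .drop(labels=[word])
        .sort_values(ascending=False)
        .head(n)
    )

# Usage example: most_similar('china', 5)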