# Import required libraries, including gensim, matplotlib, and pandas
import pprint
from collections import defaultdict

import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
import matplotlib.pyplot as plt
import pandas as pd

plt.rcParams['figure.figsize'] = [12, 10]  # Set our plot size
## Load text corpus
# Helper function to load the text file containing duties / responsibilities.
# Note that each line is a separate document holding one job description.
def open_cor(filename):
    text_corpus = []
    with open(filename, "r") as f:
        for line in f:
            text_corpus.append(line)
    return text_corpus

filename = "duties.txt"
text_corpus = open_cor(filename)
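# (Added sketch) Quick check that the corpus loaded as expected: each line of
# duties.txt should come through as one job-description document.
print(f"Loaded {len(text_corpus)} documents from {filename}")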
# Create a set of frequent stopwords; these will be removed from our processed text
stoplist = set('for a of the and to in & or be with by that'.split(' '))
# Lowercase each document, split it by whitespace and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in text_corpus]
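# (Added sketch) Compare a raw line with its lowercased, stopword-filtered token list.
print(text_corpus[0])
print(texts[0])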
# Count word frequencies
frequency = defaultdict(int)  # defaultdict mapping each token to its frequency
for text in texts:  # Iterate over each tokenized document in our corpus
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)
# Store the processed corpus in a corpora.Dictionary object; this is our most important object:
# it maps each token to an integer id and records how often the token appears
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)
# The token2id attribute returns a dict mapping our tokens to their integer ids
pprint.pprint(dictionary.token2id)
# The dfs attribute returns how many of the documents contain each token
pprint.pprint(dictionary.dfs)
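# (Added sketch) Peek at the ten tokens that appear in the most documents;
# dictionary[token_id] maps an integer id back to its token string.
top_tokens = sorted(dictionary.dfs.items(), key=lambda item: item[1], reverse=True)[:10]
pprint.pprint([(dictionary[token_id], doc_freq) for token_id, doc_freq in top_tokens])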
# Save the dictionary into a text file for later use
dictionary.save_as_text("dict_text.txt")
## Comparison of a new document with the corpus
# We can convert the entire processed corpus to a list of bag-of-words vectors:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)
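# (Added sketch) Inspect the first document's bag-of-words vector as human-readable
# (token, count) pairs instead of (token_id, count) pairs.
pprint.pprint([(dictionary[token_id], count) for token_id, count in bow_corpus[0]])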
## Training our model
# Train a TF-IDF model on the bag-of-words version of the corpus
tfidf = models.TfidfModel(bow_corpus)
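# (Added sketch) See what the TF-IDF model does to a single bag-of-words document:
# each (token_id, raw count) pair becomes a (token_id, weight) pair, down-weighting common tokens.
pprint.pprint(tfidf[bow_corpus[0]])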
# Initialize a 'SparseMatrixSimilarity' index, which will be used to compute the cosine
# similarity of a document against the corpus.
# Note that 'num_features' must be set to the size of the dictionary, i.e. len(dictionary).
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))
# I spent quite some time troubleshooting this parameter and do not want you to waste your time as well!
# We insert our job description details as query_document; this is the new document to compare against the corpus
query_document = 'responsibilities collecting and interpreting data analyzing results reporting the results back to the relevant members of the business identifying patterns and trends in data sets working alongside teams within the business or the management team to establish business needs defining new data collection and analysis processes controlling existing database processing weekly and monthly reports of sm and websites accounts developing records management processes and policies identifying areas to increase efficiency and automation of processes setting up and maintain automated data processes identifying, evaluating and implementing external services and tools to support data validation and cleansing producing and track key performance indicators developing and support reporting processes monitoring and auditing data quality liaising with internal and external clients to fully understand data content gathering, understanding and documenting detailed business requirements using appropriate tools and techniques designing and carrying out surveys and analysing survey data manipulating, analysing and interpreting complex data sets relating to the employers business preparing reports for internal and external audiences using business analytics reporting tools creating data dashboards, graphs and visualisations providing sector and competitor benchmarking in market research mining and analysing large datasets, drawing valid inferences and presenting them successfully to management using a reporting tool. processing weekly and monthly reports of sm and websites accounts.'.split()
# Query the new document against our corpus and model to find how 'similar' it is to each document in the corpus
query_bow = dictionary.doc2bow(query_document)
pprint.pprint(tfidf[query_bow])  # TF-IDF weights of the query's tokens
sims = index[tfidf[query_bow]]
# Similarity values close to 1 mean the documents are very similar; the higher the value, the greater the similarity.
print(list(enumerate(sims)))
# The same output, but sorted by similarity score.
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)
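# (Added sketch) Print the text of the three most similar job descriptions, assuming
# text_corpus still holds the raw lines in the same order as the similarity scores.
top_matches = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:3]
for document_number, score in top_matches:
    print(f"{score:.3f}  {text_corpus[document_number].strip()}")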
## Visualization
# Load the previously saved text file of our dictionary; save_as_text writes one metadata
# line (skipped here) followed by token id, token and document-frequency columns.
data = pd.read_csv('dict_text.txt', sep="\t", header=None, skiprows=1)
data.columns = ["token_id", "token_name", "token_frequency"]
print(data.head())
# Display tokens sorted by frequency
print(data.sort_values(by='token_frequency', ascending=False))
# Plot tokens and their frequency
sorted_data = data.sort_values(by='token_frequency', ascending=True)
plt.figure()
plt.barh(sorted_data.token_name, sorted_data.token_frequency)
# Plot only the tokens which appear more than once, on a separate figure
filtered_data = sorted_data[sorted_data.token_frequency > 1]
plt.figure()
plt.barh(filtered_data.token_name, filtered_data.token_frequency)
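# (Added sketch) When this file is run as a plain script rather than as notebook cells,
# the figures only appear after an explicit call to show them.
plt.show()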