Skip to content

Instantly share code, notes, and snippets.

@estasney
Created May 14, 2018 02:46
Show Gist options
  • Select an option

  • Save estasney/616501ca80126d1df3ba99ca39e90f53 to your computer and use it in GitHub Desktop.

Select an option

Save estasney/616501ca80126d1df3ba99ca39e90f53 to your computer and use it in GitHub Desktop.
Finding Synonyms by Co-occurrence
import pandas as pd
import numpy as np
from operator import itemgetter
import flashtext
from fuzzywuzzy import fuzz
from flashtext import KeywordProcessor
from sklearn.feature_extraction.text import CountVectorizer
# Load the pickled skill -> occurrence-count pairs (presumably a dict, since
# ``.items()`` is called on it) and tabulate them as two columns.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle(r"skill_counts.p")
df = pd.DataFrame.from_dict(list(df.items()))
df = df.rename(columns={0: 'Skill', 1: 'Occur'})
df = df.dropna()

# Register every distinct lower-cased skill so it can be spotted in free text.
# ``add_keywords_from_list`` wants a real list, hence the list(...) wrapper.
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(list({w.lower() for w in df['Skill']}))

# Read the corpus and extract the known skills from each job description.
corp = pd.read_csv(r"corpus.csv")
corp = corp.fillna('')  # missing descriptions become empty strings
corp['Skills'] = corp['Job Descriptions'].apply(keyword_processor.extract_keywords)

# CountVectorizer expects one string per document, so join each keyword list.
# NOTE(review): with the default token pattern the vectorizer re-tokenizes the
# joined string, so multi-word skills are split into separate words and
# one-character tokens are dropped -- confirm this is intended.
docs = corp['Skills'].apply(" ".join)

# https://stackoverflow.com/questions/35562789/word-word-co-occurrence-matrix/37822989#37822989
count_model = CountVectorizer(ngram_range=(1, 1))
X = count_model.fit_transform(docs)
Xc = (X.T * X)  # term-by-term co-occurrence matrix (sparse matrix product)
Xc.setdiag(0)   # a term co-occurring with itself is not informative

# ``get_feature_names`` was removed in scikit-learn 1.2; prefer the
# replacement and fall back for older installations. Fetch the names once.
try:
    feature_names = list(count_model.get_feature_names_out())
except AttributeError:
    feature_names = list(count_model.get_feature_names())
features = dict(enumerate(feature_names))

# Store the dense matrix in a DataFrame labelled by term on both axes.
occur = pd.DataFrame(Xc.toarray(), index=feature_names, columns=feature_names)
occur.to_pickle('kw_matrix.pkl')
def slice_array(array, pos):
    """
    Split ``array`` into the element at ``pos`` and everything else.

    Parameters:
    - array: one-dimensional sequence or numpy array
    - pos: index of the element to pull out, ``0 <= pos < len(array)``

    Returns:
    - element in array indexed by pos
    - numpy array with that element sliced out (dtype of the input is kept)

    Raises:
    - IndexError if pos does not address an element of array
    """
    size = len(array)
    # Previously an out-of-range position matched no branch and surfaced as
    # an UnboundLocalError; fail with a meaningful error instead.
    if not 0 <= pos < size:
        raise IndexError(
            "position {} is out of range for array of length {}".format(pos, size)
        )
    this = array[pos]
    # np.delete returns a copy without the element, which covers the first,
    # middle and last positions uniformly.
    others = np.delete(np.asarray(array), pos, axis=0)
    return this, others
def eval_pair(term_one, term_two, method):
    """
    Use FuzzyWuzzy to compute the similarity ratio of two terms.

    Parameters:
    - term_one: reference string
    - term_two: string compared against term_one
    - method: one of 'ratio', 'partial', 'token_sort', 'token_set'

    Returns:
    - tuple (term_two, score) where score is the value returned by the
      selected fuzzywuzzy scorer

    Raises:
    - ValueError if method is not one of the supported names
    """
    # Maps the public method name to the fuzzywuzzy scorer it selects.
    scorer_names = {
        'ratio': 'ratio',
        'partial': 'partial_ratio',
        'token_sort': 'token_sort_ratio',
        'token_set': 'token_set_ratio',
    }
    # An unknown method used to fall through and return None, which only
    # blew up later when the results were sorted; reject it up front.
    if method not in scorer_names:
        raise ValueError(
            "unknown method {!r}; expected one of {}".format(
                method, sorted(scorer_names)
            )
        )
    scorer = getattr(fuzz, scorer_names[method])
    return (term_two, scorer(term_one, term_two))
def eval_all_pairs(term, method, array):
    """
    Compute ratio for term and remaining elements in array.

    Parameters:
    - term: string to compare against the rest of array
    - method: scorer name forwarded to eval_pair
    - array: iterable of candidate strings, normally containing term itself

    Returns:
    - list of (candidate, score) tuples sorted by score, highest first
    """
    candidates = list(array)
    # Drop one occurrence of the term itself so it is not scored against
    # itself. Matching by equality avoids the old np.searchsorted lookup,
    # which needed a sorted array and silently picked a different element
    # (or ran past the end) when the term was missing.
    try:
        candidates.remove(term)
    except ValueError:
        pass  # term is not in array: score it against every element
    runs = [eval_pair(term, other, method) for other in candidates]
    runs.sort(key=itemgetter(1), reverse=True)
    return runs
def filter_by_ranks(term, method, array, max_rank=2):
    """
    Filter values by pandas rank.

    Scores term against the other elements of array and keeps the
    candidates whose score falls within the best ``max_rank`` distinct
    score values (dense ranking, so ties share a rank).

    Parameters:
    - term: string to find close matches for
    - method: scorer name forwarded to eval_all_pairs
    - array: iterable of candidate strings
    - max_rank: highest dense rank to keep (default 2)

    Returns:
    - list of candidate strings, best scores first, or None when there is
      nothing to return
    """
    scores = eval_all_pairs(term, method, array)
    # With no candidates the DataFrame would have no score column and the
    # ranking below would raise a KeyError.
    if not scores:
        return None
    tdf = pd.DataFrame(scores, columns=['candidate', 'score'])
    tdf['rank'] = tdf['score'].rank(ascending=False, method='dense')
    tops = tdf.loc[tdf['rank'] <= max_rank, 'candidate']
    if tops.empty:
        return None
    return tops.values.tolist()
# One row per vocabulary term, taken from the co-occurrence matrix index.
term_syn = pd.DataFrame({'term': occur.index.values})
# For every term, collect the closest spellings among all terms using the
# plain fuzzy ratio.
term_syn['syns'] = term_syn['term'].apply(
    lambda word: filter_by_ranks(term=word, method='ratio', array=term_syn['term'].values)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment