Created
May 14, 2018 02:46
-
-
Save estasney/616501ca80126d1df3ba99ca39e90f53 to your computer and use it in GitHub Desktop.
Finding Synonyms by Co-occurrence
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| from operator import itemgetter | |
| import flashtext | |
| from fuzzywuzzy import fuzz | |
| from flashtext import KeywordProcessor | |
| from sklearn.feature_extraction.text import CountVectorizer | |
# Load the pickled skill -> count mapping and flatten it into a two-column frame.
# NOTE(security): unpickling can execute arbitrary code; only load
# "skill_counts.p" when it comes from a trusted source.
df = pd.read_pickle(r"skill_counts.p")
df = pd.DataFrame.from_dict(list(df.items()))
df = df.rename(columns={0: 'Skill', 1: 'Occur'})
df = df.dropna()

# Register every distinct lower-cased skill as a keyword to scan for.
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_list(
    list({skill.lower() for skill in df['Skill'].values.tolist()})
)

# Read the corpus and extract the known skills found in each job description.
corp = pd.read_csv(r"corpus.csv")
corp = corp.fillna('')  # missing descriptions become empty strings
corp['Skills'] = corp['Job Descriptions'].apply(keyword_processor.extract_keywords)

# Join each keyword list into one string, the input format CountVectorizer expects.
# NOTE(review): CountVectorizer's default tokenizer splits on word boundaries
# and drops one-character tokens, so a multi-word keyword ends up as several
# separate features. Confirm this is intended.
docs = corp['Skills'].apply(" ".join)

# https://stackoverflow.com/questions/35562789/word-word-co-occurrence-matrix/37822989#37822989
count_model = CountVectorizer(ngram_range=(1, 1))
X = count_model.fit_transform(docs)
# Term-by-term co-occurrence counts (sparse). `@` is always a matrix product,
# whichever sparse container type X happens to be.
Xc = X.T @ X
Xc.setdiag(0)  # a term co-occurring with itself carries no information

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back for older releases.
try:
    feature_names = count_model.get_feature_names_out()
except AttributeError:
    feature_names = count_model.get_feature_names()

# Map positional index -> term and use it to label both axes of the matrix.
features = dict(enumerate(feature_names))
occur = pd.DataFrame(Xc.todense()).rename(columns=features, index=features)
occur.to_pickle('kw_matrix.pkl')
def slice_array(array, pos):
    """
    Split ``array`` into the element at ``pos`` and everything else.

    Parameters:
    - array: an indexable, sliceable sequence (e.g. list or NumPy array)
    - pos: zero-based, non-negative index of the element to pull out

    Returns:
    - element in array indexed by pos
    - array with element sliced out (a plain slice of ``array`` when the
      element is first or last, otherwise a new NumPy array)

    Raises:
    - IndexError if ``pos`` is negative or past the end of ``array``
    """
    last = len(array) - 1
    # Without this guard an out-of-range pos matched no branch below and
    # surfaced as a confusing UnboundLocalError.
    if not 0 <= pos <= last:
        raise IndexError(
            "pos {} is out of range for array of length {}".format(pos, len(array))
        )

    this = array[pos]
    if pos == 0:
        others = array[1:]
    elif pos == last:
        others = array[:pos]
    else:
        # Stitch together the pieces on either side of the removed element.
        others = np.array(list(array[:pos]) + list(array[pos + 1:]))
    return this, others
def eval_pair(term_one, term_two, method):
    """
    Use FuzzyWuzzy to compute a similarity ratio between two terms.

    Parameters:
    - term_one: reference string
    - term_two: string compared against ``term_one``
    - method: one of 'ratio', 'partial', 'token_sort', 'token_set'

    Returns:
    - tuple ``(term_two, score)`` where score is the value returned by the
      selected ``fuzz`` function

    Raises:
    - ValueError if ``method`` is not one of the supported names
    """
    # Map the public method name to the fuzz function that implements it.
    scorer_names = {
        'ratio': 'ratio',
        'partial': 'partial_ratio',
        'token_sort': 'token_sort_ratio',
        'token_set': 'token_set_ratio',
    }
    try:
        scorer_name = scorer_names[method]
    except KeyError:
        # Previously an unknown method silently returned None, which only
        # failed later when the results were sorted by score.
        raise ValueError(
            "Unknown method {!r}; expected one of {}".format(
                method, sorted(scorer_names)
            )
        ) from None
    return (term_two, getattr(fuzz, scorer_name)(term_one, term_two))
def eval_all_pairs(term, method, array):
    """
    Compute ratio for term and remaining elements in array.

    The first element equal to ``term`` is excluded so the term is not
    scored against itself. ``array`` does not need to be sorted; if
    ``term`` is not present at all, every element is scored.

    Parameters:
    - term: string to compare
    - method: scoring method name understood by ``eval_pair``
    - array: sequence of candidate strings

    Returns:
    - list of ``(candidate, score)`` tuples, highest score first
    """
    # Find the term by equality. The earlier np.searchsorted lookup assumed a
    # sorted array and, for an absent term, returned an insertion point, so an
    # unrelated element was removed and used in place of ``term``.
    term_pos = next(
        (idx for idx, candidate in enumerate(array) if candidate == term),
        None,
    )
    if term_pos is None:
        others = array
    else:
        _, others = slice_array(array, term_pos)

    runs = [eval_pair(term, x, method) for x in others]
    return sorted(runs, key=itemgetter(1), reverse=True)
def filter_by_ranks(term, method, array, max_rank=2):
    """
    Filter values by pandas rank.

    Scores ``term`` against the other elements of ``array`` and keeps the
    candidates whose score falls within the top ``max_rank`` dense ranks
    (tied scores share a rank).

    Parameters:
    - term: string to find near-matches for
    - method: scoring method name passed through to ``eval_all_pairs``
    - array: sequence of candidate strings
    - max_rank: highest dense rank to keep (default 2)

    Returns:
    - list of matching candidate strings, or None when there are none
    """
    scores = eval_all_pairs(term, method, array)
    # With no candidates the DataFrame would have no score column and the
    # rank lookup below would raise KeyError.
    if not scores:
        return None

    tdf = pd.DataFrame(scores)  # column 0 = candidate, column 1 = score
    tdf['rank'] = tdf[1].rank(ascending=False, method='dense')
    tops = tdf.loc[tdf['rank'] <= max_rank]
    if tops.empty:
        return None
    return tops[0].values.tolist()
# One row per term taken from the co-occurrence matrix index.
term_syn = pd.DataFrame({'term': occur.index.values})
# For each term, keep the other terms whose fuzzy 'ratio' score ranks highest.
term_syn['syns'] = term_syn['term'].apply(
    filter_by_ranks, args=('ratio', term_syn['term'].values)
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment