estasney · May 14, 2018 02:46
diff --git a/synonyms.py b/synonyms.py
 import pandas as pd
 import numpy as np
 from operator import itemgetter
 import flashtext
 from fuzzywuzzy import fuzz
 from flashtext import KeywordProcessor
 from sklearn.feature_extraction.text import CountVectorizer

 # Load the pickled mapping and reshape its (key, value) pairs into a
 # two-column frame named 'Skill' / 'Occur'.
 # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
 df = pd.read_pickle(r"skill_counts.p")
 df = pd.DataFrame.from_dict(list(df.items()))
 df = df.rename(columns={0: 'Skill', 1: 'Occur'})
 df = df.dropna()

 # Register every distinct lower-cased skill as a keyword to scan for
 keyword_processor = KeywordProcessor()
 keyword_processor.add_keywords_from_list(list({w.lower() for w in df['Skill'].values.tolist()}))

 # Reading in corpus; empty cells become '' so keyword extraction always gets a string
 corp = pd.read_csv(r"corpus.csv")
 corp = corp.fillna('')
 corp['Skills'] = corp['Job Descriptions'].apply(lambda x: keyword_processor.extract_keywords(x))

 # Joining list for CountVectorizer
 # NOTE(review): with CountVectorizer's default tokenizer, multi-word keywords
 # are presumably split into separate tokens and one-character tokens dropped
 # after this join -- confirm that is intended.
 docs = corp['Skills'].apply(lambda x: " ".join(x))

 # https://stackoverflow.com/questions/35562789/word-word-co-occurrence-matrix/37822989#37822989
 count_model = CountVectorizer(ngram_range=(1, 1))
 X = count_model.fit_transform(docs)
 Xc = (X.T * X) # this is co-occurrence matrix in sparse csr format
 Xc.setdiag(0)  # a term co-occurring with itself is not of interest

 # Feature Names: newer scikit-learn releases only provide
 # get_feature_names_out(); fall back to get_feature_names() on old ones.
 if hasattr(count_model, 'get_feature_names_out'):
     feature_names = list(count_model.get_feature_names_out())
 else:
     feature_names = list(count_model.get_feature_names())
 features = dict(enumerate(feature_names))

 # Store matrix in DataFrame, labelling rows and columns with the terms
 occur = pd.DataFrame(Xc.todense(), index=feature_names, columns=feature_names)
 occur.to_pickle('kw_matrix.pkl')

 def slice_array(array, pos):
     """
     Split ``array`` into the element at ``pos`` and all remaining elements.

     Args:
         array: Sequence supporting ``len()``, integer indexing and slicing
             (e.g. a list or a 1-D NumPy array).
         pos: Zero-based position of the element to pull out. Must satisfy
             ``0 <= pos < len(array)``.

     Returns:
         Tuple ``(this, others)`` where ``this`` is ``array[pos]`` and
         ``others`` holds the remaining elements in their original order.
         For the first or last position ``others`` is a plain slice of
         ``array``; for an interior position it is a new ``np.array``.

     Raises:
         IndexError: If ``pos`` is outside ``0 .. len(array) - 1``.
     """
     last = len(array) - 1
     # Reject out-of-range positions explicitly; otherwise no branch below
     # would assign the results.
     if not 0 <= pos <= last:
         raise IndexError(
             "pos {} is out of range for array of length {}".format(pos, last + 1)
         )

     this = array[pos]
     if pos == 0:
         others = array[1:]
     elif pos == last:
         others = array[:pos]
     else:
         # Interior element: stitch the two halves around it back together.
         others = np.array(list(array[:pos]) + list(array[pos + 1:]))
     return this, others
    
 def eval_pair(term_one, term_two, method):
     """
     Use FuzzyWuzzy to compute a similarity ratio between two terms.

     Args:
         term_one: First string to compare.
         term_two: Second string to compare; echoed back in the result.
         method: Which ``fuzz`` scorer to use: ``'ratio'``, ``'partial'``,
             ``'token_sort'`` or ``'token_set'``.

     Returns:
         Tuple ``(term_two, score)`` where ``score`` is the value returned by
         the selected ``fuzz`` scorer.

     Raises:
         ValueError: If ``method`` is not one of the supported names.
     """
     # Map the public method names onto the matching fuzz function names.
     scorer_names = {
         'ratio': 'ratio',
         'partial': 'partial_ratio',
         'token_sort': 'token_sort_ratio',
         'token_set': 'token_set_ratio',
     }
     if method not in scorer_names:
         # Fail loudly instead of returning None, which would only surface
         # later as an obscure error when the results are sorted.
         raise ValueError(
             "Unknown method {!r}; expected one of {}".format(method, sorted(scorer_names))
         )
     scorer = getattr(fuzz, scorer_names[method])
     return (term_two, scorer(term_one, term_two))
    
 def eval_all_pairs(term, method, array):
     """
     Compute ratio for term and remaining elements in array.

     Args:
         term: The string to compare against the other entries.
         method: Scorer name forwarded to ``eval_pair``.
         array: Sequence of candidate strings. It does not need to be sorted.

     Returns:
         List of ``(other_term, score)`` tuples sorted by score, highest
         first. The first occurrence of ``term`` in ``array`` is left out;
         if ``term`` does not occur, every element is scored.
     """
     # Locate the term by equality rather than binary search, so an unsorted
     # array or an absent term cannot make us drop the wrong element.
     try:
         term_pos = list(array).index(term)
     except ValueError:
         others = array
     else:
         _, others = slice_array(array, term_pos)
     runs = [eval_pair(term, x, method) for x in others]
     return sorted(runs, key=itemgetter(1), reverse=True)

 def filter_by_ranks(term, method, array, max_rank=2):
     """
     Filter values by pandas rank.

     Scores ``term`` against the other entries of ``array`` and keeps the
     entries whose dense rank (1 = highest score, ties share a rank) is at
     most ``max_rank``.

     Args:
         term: The string to find close matches for.
         method: Scorer name forwarded to ``eval_all_pairs``.
         array: Sequence of candidate strings.
         max_rank: Highest dense rank to keep. Defaults to 2.

     Returns:
         List of the best-matching terms, or ``None`` when there is nothing
         to compare against or nothing passes the rank cut-off.
     """
     scores = eval_all_pairs(term, method, array)
     if not scores:
         # An empty score list would give a frame without column 1 below.
         return None
     tdf = pd.DataFrame(scores)
     tdf['rank'] = tdf[1].rank(ascending=False, method='dense')
     tops = tdf.loc[tdf['rank'] <= max_rank]
     if tops.empty:
         return None
     return tops[0].values.tolist()

 # One row per term in the co-occurrence matrix index; 'syns' holds the
 # closest-matching other terms according to the plain fuzz ratio.
 term_syn = pd.DataFrame({'term': occur.index.values})
 _all_terms = term_syn['term'].values
 term_syn['syns'] = term_syn['term'].apply(
     lambda candidate: filter_by_ranks(candidate, 'ratio', _all_terms)
 )
	import pandas as pd
	import numpy as np
	from operator import itemgetter
	import flashtext
	from fuzzywuzzy import fuzz
	from flashtext import KeywordProcessor
	from sklearn.feature_extraction.text import CountVectorizer

	# Load the pickled mapping and reshape its (key, value) pairs into a
	# two-column frame named 'Skill' / 'Occur'.
	# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
	df = pd.read_pickle(r"skill_counts.p")
	df = pd.DataFrame.from_dict(list(df.items()))
	df = df.rename(columns={0: 'Skill', 1: 'Occur'})
	df = df.dropna()

	# Register every distinct lower-cased skill as a keyword to scan for
	keyword_processor = KeywordProcessor()
	keyword_processor.add_keywords_from_list(list({w.lower() for w in df['Skill'].values.tolist()}))

	# Reading in corpus; empty cells become '' so keyword extraction always gets a string
	corp = pd.read_csv(r"corpus.csv")
	corp = corp.fillna('')
	corp['Skills'] = corp['Job Descriptions'].apply(lambda x: keyword_processor.extract_keywords(x))

	# Joining list for CountVectorizer
	# NOTE(review): with CountVectorizer's default tokenizer, multi-word keywords
	# are presumably split into separate tokens and one-character tokens dropped
	# after this join -- confirm that is intended.
	docs = corp['Skills'].apply(lambda x: " ".join(x))

	# https://stackoverflow.com/questions/35562789/word-word-co-occurrence-matrix/37822989#37822989
	count_model = CountVectorizer(ngram_range=(1, 1))
	X = count_model.fit_transform(docs)
	Xc = (X.T * X) # this is co-occurrence matrix in sparse csr format
	Xc.setdiag(0)  # a term co-occurring with itself is not of interest

	# Feature Names: newer scikit-learn releases only provide
	# get_feature_names_out(); fall back to get_feature_names() on old ones.
	if hasattr(count_model, 'get_feature_names_out'):
	    feature_names = list(count_model.get_feature_names_out())
	else:
	    feature_names = list(count_model.get_feature_names())
	features = dict(enumerate(feature_names))

	# Store matrix in DataFrame, labelling rows and columns with the terms
	occur = pd.DataFrame(Xc.todense(), index=feature_names, columns=feature_names)
	occur.to_pickle('kw_matrix.pkl')

	def slice_array(array, pos):
	    """
	    Split ``array`` into the element at ``pos`` and all remaining elements.

	    Args:
	        array: Sequence supporting ``len()``, integer indexing and slicing
	            (e.g. a list or a 1-D NumPy array).
	        pos: Zero-based position of the element to pull out. Must satisfy
	            ``0 <= pos < len(array)``.

	    Returns:
	        Tuple ``(this, others)`` where ``this`` is ``array[pos]`` and
	        ``others`` holds the remaining elements in their original order.
	        For the first or last position ``others`` is a plain slice of
	        ``array``; for an interior position it is a new ``np.array``.

	    Raises:
	        IndexError: If ``pos`` is outside ``0 .. len(array) - 1``.
	    """
	    last = len(array) - 1
	    # Reject out-of-range positions explicitly; otherwise no branch below
	    # would assign the results.
	    if not 0 <= pos <= last:
	        raise IndexError(
	            "pos {} is out of range for array of length {}".format(pos, last + 1)
	        )

	    this = array[pos]
	    if pos == 0:
	        others = array[1:]
	    elif pos == last:
	        others = array[:pos]
	    else:
	        # Interior element: stitch the two halves around it back together.
	        others = np.array(list(array[:pos]) + list(array[pos + 1:]))
	    return this, others

	def eval_pair(term_one, term_two, method):
	    """
	    Use FuzzyWuzzy to compute a similarity ratio between two terms.

	    Args:
	        term_one: First string to compare.
	        term_two: Second string to compare; echoed back in the result.
	        method: Which ``fuzz`` scorer to use: ``'ratio'``, ``'partial'``,
	            ``'token_sort'`` or ``'token_set'``.

	    Returns:
	        Tuple ``(term_two, score)`` where ``score`` is the value returned by
	        the selected ``fuzz`` scorer.

	    Raises:
	        ValueError: If ``method`` is not one of the supported names.
	    """
	    # Map the public method names onto the matching fuzz function names.
	    scorer_names = {
	        'ratio': 'ratio',
	        'partial': 'partial_ratio',
	        'token_sort': 'token_sort_ratio',
	        'token_set': 'token_set_ratio',
	    }
	    if method not in scorer_names:
	        # Fail loudly instead of returning None, which would only surface
	        # later as an obscure error when the results are sorted.
	        raise ValueError(
	            "Unknown method {!r}; expected one of {}".format(method, sorted(scorer_names))
	        )
	    scorer = getattr(fuzz, scorer_names[method])
	    return (term_two, scorer(term_one, term_two))

	def eval_all_pairs(term, method, array):
	    """
	    Compute ratio for term and remaining elements in array.

	    Args:
	        term: The string to compare against the other entries.
	        method: Scorer name forwarded to ``eval_pair``.
	        array: Sequence of candidate strings. It does not need to be sorted.

	    Returns:
	        List of ``(other_term, score)`` tuples sorted by score, highest
	        first. The first occurrence of ``term`` in ``array`` is left out;
	        if ``term`` does not occur, every element is scored.
	    """
	    # Locate the term by equality rather than binary search, so an unsorted
	    # array or an absent term cannot make us drop the wrong element.
	    try:
	        term_pos = list(array).index(term)
	    except ValueError:
	        others = array
	    else:
	        _, others = slice_array(array, term_pos)
	    runs = [eval_pair(term, x, method) for x in others]
	    return sorted(runs, key=itemgetter(1), reverse=True)

	def filter_by_ranks(term, method, array, max_rank=2):
	    """
	    Filter values by pandas rank.

	    Scores ``term`` against the other entries of ``array`` and keeps the
	    entries whose dense rank (1 = highest score, ties share a rank) is at
	    most ``max_rank``.

	    Args:
	        term: The string to find close matches for.
	        method: Scorer name forwarded to ``eval_all_pairs``.
	        array: Sequence of candidate strings.
	        max_rank: Highest dense rank to keep. Defaults to 2.

	    Returns:
	        List of the best-matching terms, or ``None`` when there is nothing
	        to compare against or nothing passes the rank cut-off.
	    """
	    scores = eval_all_pairs(term, method, array)
	    if not scores:
	        # An empty score list would give a frame without column 1 below.
	        return None
	    tdf = pd.DataFrame(scores)
	    tdf['rank'] = tdf[1].rank(ascending=False, method='dense')
	    tops = tdf.loc[tdf['rank'] <= max_rank]
	    if tops.empty:
	        return None
	    return tops[0].values.tolist()

	# One row per term in the co-occurrence matrix index; 'syns' holds the
	# closest-matching other terms according to the plain fuzz ratio.
	term_syn = pd.DataFrame({'term': occur.index.values})
	_all_terms = term_syn['term'].values
	term_syn['syns'] = term_syn['term'].apply(
	    lambda candidate: filter_by_ranks(candidate, 'ratio', _all_terms)
	)
No results found