import sys
import time

import pandas as pd
import redis

start = time.time()  # wall-clock start, reported when the script finishes

# Single Redis connection shared by all stages (db 0 on localhost).
REDIS_URL = "redis://localhost:6379/0"
r = redis.from_url(REDIS_URL)

SIMILAR_COUNT = 3  # number of similar topics to keep for each keyword

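# Expected inputs (inferred from the code below): big_Keywords.csv with a
# 'keyword' column, and small_Topics.csv with a 'Topic' column.
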
def read_biglist():
    """Read keywords from big_Keywords.csv and push them onto a Redis list."""
    biglist = pd.read_csv("big_Keywords.csv")
    for keyword in biglist.keyword.tolist():
        r.lpush("big_keywords", keyword)

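# Quick sanity check (illustrative, not part of the original gist): after
# read_biglist() has run, the queue length should match the CSV row count:
#   r.llen("big_keywords")
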
def process():
    """Pop keywords off the Redis queue and score each one against every
    topic with spaCy word vectors, keeping the top SIMILAR_COUNT topics."""
    # Imported here so the large vector model is only loaded by this stage.
    import en_vectors_web_lg
    nlp = en_vectors_web_lg.load()

    topicdf = pd.read_csv("small_Topics.csv", encoding="latin-1")
    topics = topicdf.Topic.tolist()

    while True:
        raw = r.lpop("big_keywords")
        if raw is None:  # queue drained; lpop returns None, which cannot be decoded
            break
        big_keyword = raw.decode("utf-8")

        # Score every topic against this keyword in a temporary sorted set.
        key = "keyword_score###{}".format(big_keyword)
        for topic in topics:
            score = nlp(str(big_keyword)).similarity(nlp(str(topic)))
            elem = "{}###{}###{}".format(big_keyword, topic, score)
            r.zadd(key, {elem: score})

        # Take the SIMILAR_COUNT highest-scoring topics, then drop the set.
        n_result = r.zrevrangebyscore(key, "+inf", "-inf", start=0, num=SIMILAR_COUNT)
        result = [big_keyword] + [item.decode("utf-8").split("###")[1] for item in n_result]
        r.lpush("results", ",".join(result))
        r.delete(key)
        print(",".join(result))

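# Each line pushed to 'results' is the keyword followed by its most similar
# topics, e.g. (made-up values, assuming SIMILAR_COUNT = 3):
#   "machine learning,AI,Statistics,Data Science"
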
def dump():
    """Write every accumulated result line out to results.csv."""
    with open("results.csv", "w") as f:
        for row in r.lrange("results", 0, -1):
            line = row.decode("utf-8")
            print(line)
            f.write(line + "\n")

if __name__ == "__main__":
    functions = {
        "read_biglist": read_biglist,
        "process": process,
        "dump": dump,
    }
    func = functions.get(sys.argv[1]) if len(sys.argv) > 1 else None
    if func is None:
        sys.exit("Usage: {} [read_biglist|process|dump]".format(sys.argv[0]))
    func()
    print("Elapsed: {:.1f}s".format(time.time() - start))
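
# Usage sketch (not part of the original gist; the file name is hypothetical):
#   python keyword_similarity.py read_biglist   # 1. enqueue all keywords
#   python keyword_similarity.py process        # 2. score them; several copies
#                                               #    can run in parallel, each
#                                               #    popping from the shared queue
#   python keyword_similarity.py dump           # 3. write results.csv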