Code to run the doc2vec conversion

Adapted from the Gensim Doc2Vec tutorial.

Steps to run the code -

  • Download the attachment.
  • Make sure Python is installed.
  • Run pip install gensim nltk. NLTK's punkt tokenizer data is also required; see the snippet after this list.
  • If any other library is missing, install it with pip install <library_name>.
  • Run python code_runner.py <input_folder>, where <input_folder> is the folder that contains the case files. You can get its path by navigating into the folder in a terminal and typing pwd.
  • The model trains itself; once training finishes, run python code_checker.py <input_folder> (the checker also needs the input folder to map tags back to filenames).
  • You will be asked to enter a sample text, for which the most relevant documents will be returned.
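word_tokenize depends on NLTK's punkt tokenizer data, which pip install nltk does not fetch; a one-time setup snippet, run once before the scripts:

# One-time setup: fetch the tokenizer data used by nltk.word_tokenize.
import nltk
nltk.download('punkt')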
# code_checker.py
# Checks the trained doc2vec model: given an input sentence, returns the most similar documents.
import os
import sys

from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize

model = Doc2Vec.load("d2v.model")

text = input("Please enter the sentence that you want a relevant document for: ")

# Infer a vector for a document that is not in the training data.
test_data = word_tokenize(text.lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# List the files in the input folder, sorted so the order matches training.
folder_name = sys.argv[1]
file_list = sorted(os.listdir(folder_name))

# Find the most similar training documents for the given sentence.
similar_doc = model.docvecs.most_similar(positive=[v1], topn=5)
print(similar_doc)
print('Corresponding filenames for the returned tags are -')
for tag, score in similar_doc:
    print('Tag {0} - File {1}'.format(tag, file_list[int(tag)]))

# Vector of a document that is in the training data, looked up by its tag
# (here, the document at index 1).
print(model.docvecs['1'])
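Note that infer_vector is stochastic: inferring the same text twice gives slightly different vectors, so similarity scores vary a little between runs. A minimal sketch of checking this, assuming model and test_data from code_checker.py above:

# Sketch: infer the same tokens twice and compare the two vectors.
import numpy as np
a = model.infer_vector(test_data)
b = model.infer_vector(test_data)
cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print("cosine similarity between two inferences:", cosine)  # typically close to, but below, 1.0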
# code_runner.py
# Trains a doc2vec model on the case files in the given input folder.
import os
import sys

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# List the files in the input folder, sorted so tags are assigned in a reproducible order.
folder_name = sys.argv[1]
file_list = sorted(os.listdir(folder_name))

# Read the text of the first files (indices 0-100 at most),
# skipping the first 6 lines of each file, which hold metadata.
total_text = list()
for i, file_name in enumerate(file_list):
    if i > 100:
        break
    with open(os.path.join(folder_name, file_name), 'r') as f:
        text = f.readlines()
    total_text.append(''.join(text[6:]))

# Tag each document with its index so it can be looked up later.
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)])
               for i, doc in enumerate(total_text)]

# Train the model (gensim 3.x API; in gensim >= 4.0, size is vector_size,
# model.iter is model.epochs, and model.docvecs is model.dv).
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # Decrease the learning rate, and pin min_alpha to it so there is
    # no further decay within a single train() call.
    model.alpha -= 0.0002
    model.min_alpha = model.alpha

# Save under the name that code_checker.py loads.
model.save("d2v.model")
print("Model Saved")
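A quick way to sanity-check the trained model (a sketch, not part of the original gist, assuming model and tagged_data from code_runner.py above): re-infer a vector for each training document and count how often its own tag comes back as the most similar one.

# Sketch: self-similarity check, in the spirit of the Gensim tutorial's assessment step.
ranks = []
for doc in tagged_data:
    inferred = model.infer_vector(doc.words)
    sims = model.docvecs.most_similar(positive=[inferred], topn=len(tagged_data))
    ranks.append([tag for tag, score in sims].index(doc.tags[0]))
# A reasonably trained model ranks most documents as most similar to themselves (rank 0).
print("documents most similar to themselves:", ranks.count(0), "out of", len(ranks))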