Last active March 20, 2019 18:55
Code to Run the doc2vec conversion

Taken from Gensim Tutorial

Steps to run the code -

  • Download the attachment
  • Make sure python is installed
  • Type pip install gensim nltk
  • If any library is missing, install it with pip install <library_name>
  • Run python <training script> <input folder>, where <training script> is the downloaded model-training file
  • Here the input folder is the folder that contains the case files
  • To get the path of the input folder, navigate into it in a terminal and type pwd; this prints the folder's full path
  • The model will train itself; after training has finished, run python <query script> <input folder> (the query script also takes the input folder as its first argument)
  • You will be asked to enter the sample text for which the relevant documents would be returned.
# This code checks the summarizer for a given input.
#
# It loads the trained Doc2Vec model from "d2v.model", asks the user for a
# sentence, infers a vector for it and prints the five training documents
# whose vectors are most similar, together with their file names.
#
# Usage: python <this script> <input folder>
import os
import sys

#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Name of input folder. Checked before anything else so that a missing
# argument is reported up front instead of as an IndexError after the model
# has been loaded and the user has already typed a sentence.
if len(sys.argv) < 2:
    sys.exit("Usage: python <script> <input folder>")
folder_name = sys.argv[1]

# Stores the list of files in a given folder.
# NOTE(review): the body of the original listing loop was missing; it is
# reconstructed here as "collect every entry". Document tags are used below
# as indices into this list, so the listing has to match the one the model
# was trained with -- confirm against the training script.
file_list = os.listdir(folder_name)

model = Doc2Vec.load("d2v.model")
text = input("Please enter the sentence that you want a relevant document for!")

#to find the vector of a document which is not in training data
test_data = word_tokenize(text.lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using the given sentence
# The vector printed above is reused instead of inferring a second one, so
# the ranking is computed for exactly the vector that was shown.
similar_doc = model.docvecs.most_similar(positive=[v1], topn=5)

print('Corresponding filenames for a given tag is - ')
for val in similar_doc:
    # val[0] is the document tag; it is converted to int and used as an
    # index into file_list.
    print('Tag{0} - File{1}'.format(val[0], file_list[int(val[0])]))
# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
# Trains a Doc2Vec model on the text files of the input folder and saves it
# as "d2v.model", the file name the query script loads.
#
# Usage: python <this script> <input folder>
import os
import pdb
import sys

#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Number of leading lines dropped from every file before training.
HEADER_LINES = 6
# Only the first 101 directory entries are used (the original test was
# "i <= 100").
MAX_FILES = 101

# Name of input folder
if len(sys.argv) < 2:
    sys.exit("Usage: python <script> <input folder>")
folder_name = sys.argv[1]

# Stores the list of files in a given folder.
# NOTE(review): the body of the original listing loop was missing; it is
# reconstructed here as "collect every entry".
file_list = os.listdir(folder_name)

#Stores the given text in total_text
PATH = folder_name
total_text = list()
for name in file_list[:MAX_FILES]:
    # os.path.join instead of PATH + name: a folder path without a trailing
    # separator would otherwise be glued straight onto the file name.
    with open(os.path.join(PATH, name), 'r') as handle:
        lines = handle.readlines()
    # NOTE(review): the statements that stored the text were missing from the
    # original. Each document is kept as one string because it is lower-cased
    # as a whole further down; the position in total_text becomes the tag.
    total_text.append("".join(lines[HEADER_LINES:]))

if not total_text:
    sys.exit("No files found in {0}".format(folder_name))

# Runs the summarizer
data = total_text
tagged_data = [
    TaggedDocument(words=word_tokenize(document.lower()), tags=[str(index)])
    for index, document in enumerate(data)
]

# train the model
max_epochs = 100
vec_size = 20
alpha = 0.025

# NOTE(review): the original call was truncated after its first argument.
# alpha=alpha is restored because `alpha` was otherwise unused; min_alpha and
# min_count are reconstructed and should be confirmed. `vector_size` is the
# current name of the parameter the original passed as `size`.
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)

# The vocabulary has to be built before the first call to train().
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# Saved under the name the query script loads.
model.save("d2v.model")
print("Model Saved")
