import re
import glob
import multiprocessing
from nltk.corpus import stopwords               # requires a one-time nltk.download('stopwords')
from gensim.models import Word2Vec, KeyedVectors
from tqdm import tqdm

# Clean the data
stopwords_list = stopwords.words('english')
cores = multiprocessing.cpu_count()

def clean_data(w):
    w = w.lower()                                # lower-casing
    w = re.sub(r'[^\w\s]', '', w)                # remove punctuation and other special characters
    w = re.sub(r'[0-9]', ' ', w)                 # remove numbers
    words = w.split()
    clean_words = [word for word in words
                   if word not in stopwords_list and len(word) > 2]   # drop stopwords and short tokens
    return " ".join(clean_words)
# Streaming corpus loader: an object with __iter__ (rather than a one-shot generator) so that
# gensim can iterate over the corpus several times (once for build_vocab, once per training epoch)
class SentGen(object):
    def __init__(self, dirname):                 # dirname: list of .txt file paths (e.g. from glob)
        self.dirname = dirname
    def __iter__(self):
        for fname in self.dirname:
            for line in get_sentences(fname):    # get one cleaned line and yield its tokens
                yield line.split()

def get_sentences(fname):
    with open(fname, 'r') as f:
        texts = f.readlines()
    sent = list(map(clean_data, texts))          # clean the sentences
    for line in tqdm(sent):                      # read one line and yield it
        yield line
# Give input: a glob pattern matching all the .txt files to train on
input_data = glob.glob('path_to_folder_with_multiple_txt_files_inside_it')   # e.g. 'corpus/*.txt'
model_input = SentGen(input_data)

# Initialize the model (gensim < 4.0 API; in gensim 4.x use vector_size= instead of size=)
model = Word2Vec(min_count=1, window=3, size=300, workers=cores - 1)

# Build vocabulary over the entire dataset
model.build_vocab(model_input)
vocab_len = len(model.wv.vocab)                  # in gensim 4.x: len(model.wv.key_to_index)
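# A minimal sketch of an optional per-epoch logging callback, assuming gensim's
# CallbackAny2Vec base class; it is not part of the original gist and only takes
# effect if passed to model.train(..., callbacks=[EpochLogger()]).
from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0
    def on_epoch_end(self, model):               # called by gensim after each training epoch
        self.epoch += 1
        print("Finished epoch {}".format(self.epoch))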
# Train the model (add callbacks here if required, e.g. the EpochLogger sketch above)
model.train(model_input, total_examples=model.corpus_count, epochs=50)

# Save only the word vectors, in word2vec binary format
model.wv.save_word2vec_format('gensim_w2v_model.bin', binary=True)

# Load the vectors at test time
trained_model = KeyedVectors.load_word2vec_format('gensim_w2v_model.bin', binary=True)
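# Quick usage check on the loaded vectors via the KeyedVectors API; the query words
# below are assumed examples and only work if they occur in the training vocabulary.
print(trained_model.most_similar('king', topn=5))    # nearest neighbours by cosine similarity
print(trained_model.similarity('king', 'queen'))     # cosine similarity between two words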