Created
September 29, 2020 08:16
-
-
Save swati210994/4bae9d27592653d5458231a7dcbbd925 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import nltk | |
| from nltk.corpus import stopwords | |
| from gensim.models import Word2Vec,KeyedVectors | |
| from gensim.test.utils import datapath | |
| import re | |
| import glob | |
| from tqdm import tqdm | |
| import gensim | |
| import multiprocessing | |
| import random | |
# --- Text-cleaning helpers ---
stopwords_list = stopwords.words('english')   # English stopword list (NLTK)
cores = multiprocessing.cpu_count()           # available CPUs, for Word2Vec workers

def clean_data(w):
    """Normalise one raw sentence for Word2Vec training.

    Lowercases the text, strips punctuation and digits, then drops
    English stopwords and any token of 2 characters or fewer.
    Returns the surviving tokens re-joined with single spaces.
    """
    lowered = w.lower()                          # Lower casing
    no_punct = re.sub(r'[^\w\s]', '', lowered)   # Remove other special characters
    no_digits = re.sub(r"([0-9])", r" ", no_digits_src := no_punct)  # Remove numbers
    kept = []
    for token in no_digits.split():
        # Keep only informative tokens: non-stopword and longer than 2 chars.
        if token not in stopwords_list and len(token) > 2:
            kept.append(token)
    return " ".join(kept)
# --- Incremental Word2Vec training over many .txt files ---
# NOTE(review): this uses the gensim 3.x API (`size=`, `model.wv.vocab`);
# under gensim 4+ these are `vector_size=` and `model.wv.key_to_index`.

# Give input: every .txt file matched by this glob pattern.
input_data = glob.glob('path_to_folder_with_multiple_txt_files_inside_it')

# BUGFIX: `batch` was used below but never defined (NameError on first run).
# Number of sentences cleaned/trained per step — keeps memory bounded for
# very large files; tune to taste.
batch = 10000

# Initialize the model. `cores - 1` workers leaves one CPU free.
model = Word2Vec(min_count=1, window=3, size=300, workers=cores - 1)

# Seed the vocab with two placeholder words so that later
# `build_vocab(..., update=True)` calls are legal on a non-empty model.
model.build_vocab([['word1'], ['word2']])

# BUGFIX: the original iterated over `files`, which was never defined;
# the glob result above is bound to `input_data`.
for idx, fname in enumerate(input_data):
    with open(fname, 'r') as f:  # Reading a .txt file
        texts = f.readlines()
    for i in tqdm(range(0, len(texts), batch)):  # Batch of sentences, if the dataset is very large
        cleaned_texts = list(map(clean_data, texts[i:i + batch]))  # Cleaning the texts
        # Splitting the sentences to prepare the model input
        # (list of lists of tokens in sentences).
        cleaned_texts = list(map(lambda x: x.split(), cleaned_texts))
        model.build_vocab(cleaned_texts, update=True)  # Updating vocab
        model.train(cleaned_texts, total_examples=model.corpus_count, epochs=1)  # Train the model
        print('Current Words in vocabulary: ', len(model.wv.vocab))
    # Checkpoint after every 10th file (also fires at idx 0) — optional,
    # useful when there are many .txt files.
    if idx % 10 == 0:
        save_model = 'gensim_model_build_vocab_' + str(idx) + '_files.bin'
        model.wv.save_word2vec_format(save_model, binary=True)
        print('Model saved as ', save_model)

# Saving the entire model (word vectors only, word2vec binary format).
model.wv.save_word2vec_format('complete_gensim_model_build_vocab_files.bin', binary=True)

# Loading the model during testing time.
trained_model = KeyedVectors.load_word2vec_format('gensim_w2v_model.bin', binary=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment