Skip to content

Instantly share code, notes, and snippets.

@swati210994
Created September 29, 2020 08:16
Show Gist options
  • Select an option

  • Save swati210994/4bae9d27592653d5458231a7dcbbd925 to your computer and use it in GitHub Desktop.

Select an option

Save swati210994/4bae9d27592653d5458231a7dcbbd925 to your computer and use it in GitHub Desktop.
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec,KeyedVectors
from gensim.test.utils import datapath
import re
import glob
from tqdm import tqdm
import gensim
import multiprocessing
import random
# Module-level configuration shared by clean_data and the training loop below.
# NLTK English stopword list used by clean_data to drop common words
# (requires the corpus to be available, e.g. via nltk.download('stopwords')).
stopwords_list=stopwords.words('english')
# CPU core count; the Word2Vec model below is created with cores-1 workers.
cores=multiprocessing.cpu_count()
def clean_data(w, stop_words=None):
    """Normalize one line of raw text for Word2Vec training.

    Lowercases the text, strips punctuation/special characters, replaces
    digits with spaces, then drops stopwords and words of length <= 2.

    Parameters
    ----------
    w : str
        Raw line/sentence of text.
    stop_words : iterable of str, optional
        Stopwords to remove. Defaults to the module-level NLTK
        ``stopwords_list`` (backward-compatible with the original behavior).

    Returns
    -------
    str
        Space-joined cleaned tokens.
    """
    if stop_words is None:
        stop_words = stopwords_list  # module-level NLTK English stopwords
    stop = set(stop_words)           # set for O(1) membership tests
    w = w.lower()                    # lower-casing
    w = re.sub(r'[^\w\s]', '', w)    # remove punctuation / special characters
    w = re.sub(r'[0-9]', ' ', w)     # replace digits with spaces
    # Keep words that are not stopwords and longer than 2 characters.
    clean_words = [word for word in w.split()
                   if word not in stop and len(word) > 2]
    return " ".join(clean_words)
# --- Incremental Word2Vec training over many .txt files ---

# Lines processed per step; keeps memory bounded for very large files.
# BUG FIX: 'batch' was used below but never defined in the original.
batch = 10000

# Collect the input .txt files (replace with a real pattern, e.g. 'data/*.txt').
input_data = glob.glob('path_to_folder_with_multiple_txt_files_inside_it')

# Initialize the model (gensim < 4.0 API: 'size', not 'vector_size').
model = Word2Vec(min_count=1, window=3, size=300, workers=cores - 1)

# Seed the vocabulary with two placeholder words so that subsequent
# build_vocab(update=True) calls are valid on an initialized model.
model.build_vocab([['word1'], ['word2']])

# BUG FIX: the original iterated over 'files', which was never defined;
# the file list collected above is 'input_data'.
for idx, fname in enumerate(input_data):
    with open(fname, 'r') as f:  # reading a .txt file
        texts = f.readlines()
    # Take a batch of lines at a time, in case the dataset is very large.
    for i in tqdm(range(0, len(texts), batch)):
        cleaned_texts = list(map(clean_data, texts[i:i + batch]))  # clean the texts
        # Split sentences into token lists (model input: list of lists of tokens).
        cleaned_texts = [line.split() for line in cleaned_texts]
        model.build_vocab(cleaned_texts, update=True)  # grow the vocabulary
        model.train(cleaned_texts, total_examples=model.corpus_count, epochs=1)
    print('Current Words in vocabulary: ', len(model.wv.vocab))
    # Checkpoint every 10th file (optional, useful when there are many files).
    if idx % 10 == 0:
        save_model = 'gensim_model_build_vocab_' + str(idx) + '_files.bin'
        model.wv.save_word2vec_format(save_model, binary=True)
        print('Model saved as ', save_model)

# Save the final word vectors.
model.wv.save_word2vec_format('complete_gensim_model_build_vocab_files.bin', binary=True)

# Load the vectors back at testing time.
# BUG FIX: the original loaded 'gensim_w2v_model.bin', a file this script
# never writes; load the file saved on the line above instead.
trained_model = KeyedVectors.load_word2vec_format('complete_gensim_model_build_vocab_files.bin', binary=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment