Skip to content

Instantly share code, notes, and snippets.

@swati210994
Created September 29, 2020 07:47
Show Gist options
  • Select an option

  • Save swati210994/42f450aa06db5ba9d34f4724992f158f to your computer and use it in GitHub Desktop.

Select an option

Save swati210994/42f450aa06db5ba9d34f4724992f158f to your computer and use it in GitHub Desktop.
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec,KeyedVectors
from gensim.test.utils import datapath
import re
import glob
from tqdm import tqdm
import gensim
import multiprocessing
import random
# Module-level settings shared by the cleaning and training code.
# Number of CPUs reported by the system; used to size the training worker pool.
cores = multiprocessing.cpu_count()
# NLTK's English stop-word list, loaded once at import time.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand -- confirm on the target machine.
stopwords_list = stopwords.words('english')
def clean_data(w):
    """Normalise one piece of raw text into a space-separated token string.

    Steps, in order: lower-case the text, delete every character that is
    neither a word character nor whitespace, replace each digit with a
    single space, split on whitespace, and keep only tokens that are longer
    than two characters and are not in ``stopwords_list``.

    Args:
        w: The raw text (typically one line of a file).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    # Punctuation is removed first, then digits are blanked out.
    text = re.sub(r"([0-9])", r" ", re.sub(r'[^\w\s]', '', w.lower()))
    kept = []
    for token in text.split():
        # Drop short tokens and stop words.
        if len(token) > 2 and token not in stopwords_list:
            kept.append(token)
    return " ".join(kept)
#Load the input function
class SentGen(object):
    """Re-iterable stream of tokenised sentences read from several files.

    Every call to ``iter()`` starts a fresh pass over the files, so the
    same instance can be handed to consumers that iterate more than once.

    Args:
        dirname: Despite the name, an iterable of file paths (one entry per
            text file), not a directory name. Each path is passed to
            ``get_sentences``.
    """

    def __init__(self, dirname):
        # Stored as-is; the paths are only opened during iteration.
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of tokens per cleaned line, file by file."""
        for fname in self.dirname:
            for line in get_sentences(fname):
                # A cleaned line is a space-separated string; split it into tokens.
                yield line.split()
def get_sentences(fname):
    """Yield the cleaned lines of one text file, in file order.

    Args:
        fname: Path of the text file to read.

    Yields:
        Each line of the file after it has been passed through
        ``clean_data`` (a space-separated token string, possibly empty).
    """
    with open(fname, 'r') as f:
        texts = f.readlines()
    # The file is fully read and closed above, so no handle stays open
    # while the consumer is still pulling lines from this generator.
    sent = list(map(clean_data, texts))  # Clean the sentences
    # A list (rather than a lazy map) lets tqdm know the total for its bar.
    for line in tqdm(sent):
        # Yield each cleaned line; the original `return lines` referenced
        # an undefined name and raised NameError on first use.
        yield line
# Give input
# NOTE(review): the pattern below is a placeholder; it presumably needs to
# become something like '<folder>/*.txt' so that glob returns the files.
input_data = glob.glob('path_to_folder_with_multiple_txt_files_inside_it')
model_input = SentGen(input_data)

# Initialize the model
# Leave one core free, but never request fewer than one worker: on a
# single-core machine `cores - 1` is 0, which would start no training threads.
# NOTE(review): `size` (here) and `wv.vocab` (below) look like gensim 3.x
# names; gensim 4 renamed them to `vector_size` and `wv.key_to_index` --
# confirm against the installed version.
model = Word2Vec(min_count=1, window=3, size=300, workers=max(1, cores - 1))

# Build vocabulary using the entire dataset
model.build_vocab(model_input)
vocab_len = len(model.wv.vocab)

# Training the model (add callbacks, if required)
model.train(model_input, total_examples=model.corpus_count, epochs=50)

# Saving the model: only the word vectors, in binary word2vec format
model.wv.save_word2vec_format('gensim_w2v_model.bin', binary=True)

# Loading the model during testing time
trained_model = KeyedVectors.load_word2vec_format('gensim_w2v_model.bin', binary=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment