Created
September 29, 2020 08:16
-
-
Save swati210994/4bae9d27592653d5458231a7dcbbd925 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import nltk | |
| from nltk.corpus import stopwords | |
| from gensim.models import Word2Vec,KeyedVectors | |
| from gensim.test.utils import datapath | |
| import re | |
| import glob | |
| from tqdm import tqdm | |
| import gensim | |
| import multiprocessing | |
| import random | |
# --- Text-cleaning helpers ---
stopwords_list = stopwords.words('english')   # English stopword list (NLTK)
cores = multiprocessing.cpu_count()           # available CPUs, for Word2Vec workers

def clean_data(w):
    """Normalise one raw sentence for Word2Vec training.

    Lowercases the text, strips punctuation and digits, then drops
    English stopwords and any token of 2 characters or fewer.
    Returns the surviving tokens re-joined with single spaces.
    """
    lowered = w.lower()                          # Lower casing
    no_punct = re.sub(r'[^\w\s]', '', lowered)   # Remove other special characters
    no_digits = re.sub(r"([0-9])", r" ", no_digits_src := no_punct)  # Remove numbers
    kept = []
    for token in no_digits.split():
        # Keep only informative tokens: non-stopword and longer than 2 chars.
        if token not in stopwords_list and len(token) > 2:
            kept.append(token)
    return " ".join(kept)
# --- Incremental Word2Vec training over many .txt files ---
# NOTE(review): this uses the gensim 3.x API (`size=`, `model.wv.vocab`);
# under gensim 4+ these are `vector_size=` and `model.wv.key_to_index`.

# Give input: every .txt file matched by this glob pattern.
input_data = glob.glob('path_to_folder_with_multiple_txt_files_inside_it')

# BUGFIX: `batch` was used below but never defined (NameError on first run).
# Number of sentences cleaned/trained per step — keeps memory bounded for
# very large files; tune to taste.
batch = 10000

# Initialize the model. `cores - 1` workers leaves one CPU free.
model = Word2Vec(min_count=1, window=3, size=300, workers=cores - 1)

# Seed the vocab with two placeholder words so that later
# `build_vocab(..., update=True)` calls are legal on a non-empty model.
model.build_vocab([['word1'], ['word2']])

# BUGFIX: the original iterated over `files`, which was never defined;
# the glob result above is bound to `input_data`.
for idx, fname in enumerate(input_data):
    with open(fname, 'r') as f:  # Reading a .txt file
        texts = f.readlines()
    for i in tqdm(range(0, len(texts), batch)):  # Batch of sentences, if the dataset is very large
        cleaned_texts = list(map(clean_data, texts[i:i + batch]))  # Cleaning the texts
        # Splitting the sentences to prepare the model input
        # (list of lists of tokens in sentences).
        cleaned_texts = list(map(lambda x: x.split(), cleaned_texts))
        model.build_vocab(cleaned_texts, update=True)  # Updating vocab
        model.train(cleaned_texts, total_examples=model.corpus_count, epochs=1)  # Train the model
        print('Current Words in vocabulary: ', len(model.wv.vocab))
    # Checkpoint after every 10th file (also fires at idx 0) — optional,
    # useful when there are many .txt files.
    if idx % 10 == 0:
        save_model = 'gensim_model_build_vocab_' + str(idx) + '_files.bin'
        model.wv.save_word2vec_format(save_model, binary=True)
        print('Model saved as ', save_model)

# Saving the entire model (word vectors only, word2vec binary format).
model.wv.save_word2vec_format('complete_gensim_model_build_vocab_files.bin', binary=True)

# Loading the model during testing time.
trained_model = KeyedVectors.load_word2vec_format('gensim_w2v_model.bin', binary=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment