getgimphed · May 18, 2020 20:34
diff --git a/CreatingDataSet.py b/CreatingDataSet.py
 # Build the newspaper dataset: extract text and images from four .docx files,
 # pickle every number found in the raw text (stats.pkl) and pickle the
 # cleaned, stemmed text of each newspaper (corpus.pkl).

 # Importing the libraries
 import os
 import pickle
 import re

 import docx2txt
 import nltk
 from nltk.corpus import stopwords
 from nltk.stem.porter import PorterStemmer


 def _image_dir(paper):
     """Return the image output folder for *paper*, creating it if needed."""
     # os.path.join keeps the path valid on every OS (the previous hard-coded
     # "\\" separators only worked on Windows).
     path = os.path.join(os.getcwd(), "NLP_ExtractImages", paper)
     # NOTE(review): docx2txt appears to write into an existing folder without
     # creating it; making it up front is harmless when it already exists.
     os.makedirs(path, exist_ok=True)
     return path


 def _clean_text(text, stemmer, stop_words):
     """Return *text* lowercased, reduced to letters, filtered and stemmed.

     Tokens made of one repeated character (e.g. "a", "xx") and tokens listed
     in *stop_words* are dropped; the remaining tokens are stemmed with
     *stemmer* and joined with single spaces.
     """
     words = re.sub('[^a-zA-Z]', ' ', text.lower()).split()
     words = [word for word in words if len(set(word)) != 1]
     return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


 # TH : The Hindu, TOI : Times of India, IE : Indian Express, HT : Hindustan Times
 # Loading newspaper text using docx2txt python library.
 THtext = docx2txt.process("TH.docx")
 TOItext = docx2txt.process("TOI.docx")
 IEtext = docx2txt.process("IE.docx")
 HTtext = docx2txt.process("HT.docx")

 # Pulling the images in respective folders
 THimages = docx2txt.process("TH.docx", _image_dir("TH"))
 TOIimages = docx2txt.process("TOI.docx", _image_dir("TOI"))
 IEimages = docx2txt.process("IE.docx", _image_dir("IE"))
 HTimages = docx2txt.process("HT.docx", _image_dir("HT"))

 # Cleaning text
 nltk.download('stopwords')

 # RegEx to collect every number (integer or decimal) from the raw text.
 # Raw string so that \d and \. are regex escapes, not string escapes.
 regex = re.compile(r'\d+\.?\d*')
 THStats = regex.findall(THtext)
 TOIStats = regex.findall(TOItext)
 IEStats = regex.findall(IEtext)
 HTStats = regex.findall(HTtext)
 Stats = [THStats, TOIStats, IEStats, HTStats]
 with open("stats.pkl", "wb") as file:
     pickle.dump(Stats, file)

 # Build the stemmer and the stop-word set once instead of once per word.
 ps = PorterStemmer()
 stop_words = set(stopwords.words('english'))

 THtext = _clean_text(THtext, ps, stop_words)
 TOItext = _clean_text(TOItext, ps, stop_words)
 IEtext = _clean_text(IEtext, ps, stop_words)
 HTtext = _clean_text(HTtext, ps, stop_words)

 # Corpus order is fixed: TH, TOI, IE, HT (same order as Stats).
 corpus = [THtext, TOItext, IEtext, HTtext]

 with open("corpus.pkl", "wb") as file:
     pickle.dump(corpus, file)
	# Build the newspaper dataset: extract text and images from four .docx files,
	# pickle every number found in the raw text (stats.pkl) and pickle the
	# cleaned, stemmed text of each newspaper (corpus.pkl).

	# Importing the libraries
	import os
	import pickle
	import re

	import docx2txt
	import nltk
	from nltk.corpus import stopwords
	from nltk.stem.porter import PorterStemmer


	def _image_dir(paper):
		"""Return the image output folder for *paper*, creating it if needed."""
		# os.path.join keeps the path valid on every OS (the previous hard-coded
		# "\\" separators only worked on Windows).
		path = os.path.join(os.getcwd(), "NLP_ExtractImages", paper)
		# NOTE(review): docx2txt appears to write into an existing folder without
		# creating it; making it up front is harmless when it already exists.
		os.makedirs(path, exist_ok=True)
		return path


	def _clean_text(text, stemmer, stop_words):
		"""Return *text* lowercased, reduced to letters, filtered and stemmed.

		Tokens made of one repeated character (e.g. "a", "xx") and tokens listed
		in *stop_words* are dropped; the remaining tokens are stemmed with
		*stemmer* and joined with single spaces.
		"""
		words = re.sub('[^a-zA-Z]', ' ', text.lower()).split()
		words = [word for word in words if len(set(word)) != 1]
		return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


	# TH : The Hindu, TOI : Times of India, IE : Indian Express, HT : Hindustan Times
	# Loading newspaper text using docx2txt python library.
	THtext = docx2txt.process("TH.docx")
	TOItext = docx2txt.process("TOI.docx")
	IEtext = docx2txt.process("IE.docx")
	HTtext = docx2txt.process("HT.docx")

	# Pulling the images in respective folders
	THimages = docx2txt.process("TH.docx", _image_dir("TH"))
	TOIimages = docx2txt.process("TOI.docx", _image_dir("TOI"))
	IEimages = docx2txt.process("IE.docx", _image_dir("IE"))
	HTimages = docx2txt.process("HT.docx", _image_dir("HT"))

	# Cleaning text
	nltk.download('stopwords')

	# RegEx to collect every number (integer or decimal) from the raw text.
	# Raw string so that \d and \. are regex escapes, not string escapes.
	regex = re.compile(r'\d+\.?\d*')
	THStats = regex.findall(THtext)
	TOIStats = regex.findall(TOItext)
	IEStats = regex.findall(IEtext)
	HTStats = regex.findall(HTtext)
	Stats = [THStats, TOIStats, IEStats, HTStats]
	with open("stats.pkl", "wb") as file:
		pickle.dump(Stats, file)

	# Build the stemmer and the stop-word set once instead of once per word.
	ps = PorterStemmer()
	stop_words = set(stopwords.words('english'))

	THtext = _clean_text(THtext, ps, stop_words)
	TOItext = _clean_text(TOItext, ps, stop_words)
	IEtext = _clean_text(IEtext, ps, stop_words)
	HTtext = _clean_text(HTtext, ps, stop_words)

	# Corpus order is fixed: TH, TOI, IE, HT (same order as Stats).
	corpus = [THtext, TOItext, IEtext, HTtext]

	with open("corpus.pkl", "wb") as file:
		pickle.dump(corpus, file)