# Evaluating 4 Indian English Newspapers (editions of 10th May 2020) on their:
## Vocabulary: number of unique words per page
## Factual presentation (numeric figures used)
## Sentiment analysis
## Graphic content / images: needs preprocessing (pre-extracted image folders)
## Visualising the results
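# Note: the script assumes three pre-built inputs (an assumption based on how they are loaded below):
#   - corpus.pkl: the cleaned article text of each newspaper, one document per paper
#   - stats.pkl: the numeric figures extracted from each newspaper, one collection per paper
#   - NLP_ExtractImages/{TH,TOI,IE,HT}: folders of images extracted from each paper's pages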
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import pickle
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
with open('corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)
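# corpus is expected to be an indexable collection of 4 documents (one cleaned text per newspaper,
# in the same order as the `newspapers` list below); a quick sanity check before vectorising:
assert len(corpus) == 4, "Expected one document per newspaper"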
newspapers = ['The Hindu', 'Times Of India', 'Indian Express', 'Hindustan Times']
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
docTermMatrix = cv.fit_transform(corpus).toarray()
# Note: use cv.get_feature_names_out() on scikit-learn >= 1.0 (get_feature_names was removed in 1.2)
data_dtm = pd.DataFrame(docTermMatrix, columns=cv.get_feature_names())
data_dtm.index = pd.Index(newspapers)
data_dtm = data_dtm.transpose()
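# data_dtm is now terms x newspapers: one row per unique term, one column per paper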
# Checking out top 30 words for all newspapers
top_dict = {}
for c in data_dtm.columns:
    top = data_dtm[c].sort_values(ascending=False).head(30)
    top_dict[c] = list(zip(top.index, top.values))
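# Optional quick look at the top words per paper (a small inspection helper, not part of the analysis)
for newspaper, pairs in top_dict.items():
    print(newspaper, ':', ', '.join(word for word, count in pairs[:10]))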
# Collecting the top words from every paper to see which words occur across all of them
words = []
for newspaper in data_dtm.columns:
    top = [word for (word, count) in top_dict[newspaper]]
    for t in top:
        words.append(t)
from collections import Counter
print(Counter(words).most_common())
# Adding words common across all 4 newspapers to the stop-word list
new_stop_words = [word for (word, count) in Counter(words).most_common() if count > 3]  # i.e. in the top 30 of every paper
stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)
cv = CountVectorizer(stop_words=stop_words, ngram_range=(1, 1))
docTermMatrix = cv.fit_transform(corpus).toarray()
data_stop = pd.DataFrame(docTermMatrix, columns=cv.get_feature_names())
data_stop.index = pd.Index(newspapers)
# Visualising top words as word clouds
from wordcloud import WordCloud
wc = WordCloud(stopwords=stop_words, max_words=200, background_color='white', colormap='Dark2', max_font_size=150, random_state=0)
plt.rcParams['figure.figsize'] = [16, 6]
for i, newspaper in enumerate(data_dtm.columns):
    top = data_dtm[newspaper].sort_values(ascending=False).head(100)
    listOfWords = [word for word in top.index]
    wc.generate(' '.join(listOfWords))
    plt.subplot(1, 4, i + 1)  # one panel per newspaper
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(newspaper)
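plt.tight_layout()
plt.show()  # needed when running as a plain script; a notebook renders the figure automatically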
# Getting unique words / Vocabulary
unique_list = []
for newspaper in data_dtm.columns:
    uniques = data_dtm[newspaper].to_numpy().nonzero()[0].size
    unique_list.append(uniques)
unique_words = pd.DataFrame(list(zip(newspapers, unique_list)), columns=['newspaper', 'unique_word'])
#unique_words = unique_words.sort_values('unique_word', ascending=False)
# Page counts checked manually for each edition
NoOfPages = [['The Hindu', 22], ['Times Of India', 18], ['Indian Express', 18], ['Hindustan Times', 16]]
NoOfPages = pd.DataFrame(NoOfPages, columns=['Newspaper', 'PageCount'])
# Unique words per page (NoOfPages rows are in the same order as unique_words)
WPP = [int(u / p) for u, p in zip(unique_words.unique_word, NoOfPages.PageCount)]
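# Quick check: pair each paper with its unique-words-per-page figure
print(dict(zip(newspapers, WPP)))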
# Plotting total unique words
X = np.arange(4)
plt.barh(X, unique_words.unique_word, align='center', alpha=0.5)
plt.yticks(X, newspapers)
plt.xlabel("Unique Words")
plt.title('Total Unique Words')
plt.show()
# Plotting unique words per page
plt.barh(X, WPP, align='center', alpha=0.5)
plt.yticks(X, newspapers)
plt.xlabel('Unique Words per Page')
plt.title('Unique Words per Page')
plt.show()
# Plotting the count of numeric figures per newspaper (factual presentation)
with open('stats.pkl', 'rb') as file:
    stats = pickle.load(file)
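# stats is assumed to hold one collection of extracted numeric figures per newspaper,
# in the same order as `newspapers`; only the length of each collection is used below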
statsLen = [len(li) for li in stats]
barlist = plt.barh(X, statsLen, align='center', alpha=0.5)
barlist[0].set_color('0.4')
barlist[1].set_color('r')
barlist[2].set_color('b')
barlist[3].set_color('g')
plt.yticks(X, newspapers)
plt.xlabel('Numeric Figures used')
plt.title('Numeric Figures used')
plt.show()
# Plotting Sentiment Analysis
from textblob import TextBlob
sentiment = []
for i in np.arange(4):
    sentiment.append(TextBlob(corpus[i]).subjectivity)
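# TextBlob subjectivity ranges from 0.0 (very objective / factual) to 1.0 (very subjective / opinionated)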
plt.scatter(X, sentiment, linewidths=5)
plt.xticks(X, newspapers)
plt.ylabel("<-- Facts ----------------- Opinions -->")
plt.title("Subjectivity Graph")
plt.show()
# Calculating and plotting image counts
imagesCount = []
BasePath = os.path.join(os.getcwd(), "NLP_ExtractImages")
paths = [os.path.join(BasePath, folder) for folder in ["TH", "TOI", "IE", "HT"]]
for path in paths:
    counter = 0
    for entry in os.scandir(path):
        # Count only files larger than 5,000 bytes; smaller files are treated as icons/artifacts rather than article images
        if entry.stat().st_size > 5000:
            counter += 1
    imagesCount.append(counter)
barlist = plt.bar(X, imagesCount, align='center', alpha=0.5)
barlist[0].set_color('0.4')
barlist[1].set_color('r')
barlist[2].set_color('b')
barlist[3].set_color('g')
plt.xticks(X, newspapers)
plt.ylabel('No of Significant Images')
plt.title('No of Significant Images')
plt.show()