# Evaluating 4 Indian English newspapers for 10th May 2020 on their:
## Vocabulary: number of unique words and unique words per page
## Factual presentation (numeric figures used)
## Sentiment analysis
## Graphic content / images : needs preprocessing
## Visualising the results
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import pickle
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Load the pre-built corpus
with open('corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)
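# Quick sanity check (assumption: corpus.pkl holds one cleaned text string per
# newspaper, in the same order as the `newspapers` list defined below)
print(type(corpus), len(corpus))   # expected: <class 'list'> 4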
newspapers = ['The Hindu', 'Times Of India', 'Indian Express', 'Hindustan Times']

# Build a document-term matrix with the default English stop words removed,
# then transpose so terms are rows and the 4 newspapers are columns
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
docTermMatrix = cv.fit_transform(corpus).toarray()
data_dtm = pd.DataFrame(docTermMatrix, columns=cv.get_feature_names())
data_dtm.index = pd.Index(newspapers)
data_dtm = data_dtm.transpose()
# Checking out the top 30 words for each newspaper
top_dict = {}
for c in data_dtm.columns:
    top = data_dtm[c].sort_values(ascending=False).head(30)
    top_dict[c] = list(zip(top.index, top.values))
# Pooling the top words of every paper to see which words occur across all of them
words = []
for newspaper in data_dtm.columns:
    top = [word for (word, count) in top_dict[newspaper]]
    for t in top:
        words.append(t)

from collections import Counter
Counter(words).most_common()
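# Note: a word that shows up in the top-30 list of all 4 papers appears 4 times in
# `words`, so the `count > 3` filter below keeps exactly the words common to every paper.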
# Adding anything common across all 4 newspapers to the stop-word list
new_stop_words = [word for (word, count) in Counter(words).most_common() if count > 3]
stop_words = text.ENGLISH_STOP_WORDS.union(new_stop_words)

# Rebuild the document-term matrix with the extended stop-word list
cv = CountVectorizer(stop_words=stop_words, ngram_range=(1, 1))
docTermMatrix = cv.fit_transform(corpus).toarray()
data_stop = pd.DataFrame(docTermMatrix, columns=cv.get_feature_names())
data_stop.index = pd.Index(newspapers)
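# Optional check (not in the original script): the extended stop-word list should
# shrink the vocabulary relative to the first pass
print(data_dtm.shape[0], 'terms with default stop words ->', data_stop.shape[1], 'terms after extension')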
# Visualising the top words as word clouds
from wordcloud import WordCloud
wc = WordCloud(stopwords=stop_words, max_words=200, background_color='white', colormap='Dark2', max_font_size=150, random_state=0)
plt.rcParams['figure.figsize'] = [16, 6]
for i, newspaper in enumerate(data_dtm.columns):
    top = data_dtm[newspaper].sort_values(ascending=False).head(100)
    listOfWords = [word for word in top.index]
    wc.generate(' '.join(listOfWords))
    plt.subplot(3, 4, i + 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(newspaper)
plt.show()
# Getting unique words / vocabulary size per newspaper
unique_list = []
for newspaper in data_dtm.columns:
    uniques = data_dtm[newspaper].to_numpy().nonzero()[0].size
    unique_list.append(uniques)
unique_words = pd.DataFrame(list(zip(newspapers, unique_list)), columns=['newspaper', 'unique_word'])
#unique_words = unique_words.sort_values('unique_word', ascending=False)

# Page counts, checked manually
NoOfPages = [['The Hindu', 22], ['Times Of India', 18], ['Indian Express', 18], ['Hindustan Times', 16]]
NoOfPages = pd.DataFrame(NoOfPages, columns=['Newspaper', 'PageCount'])

# Unique words per page
WPP = []
for i in range(len(NoOfPages)):
    WPP.append(int(unique_words.unique_word[i] / NoOfPages.PageCount[i]))
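# Worked example of the division above (numbers are illustrative, not measured):
# a paper with 4400 unique words across 22 pages would get WPP = 4400 // 22 = 200.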
# Plotting total unique words
X = np.arange(4)
plt.barh(X, unique_words.unique_word, align='center', alpha=0.5)
plt.yticks(X, newspapers)
plt.xlabel("Unique Words")
plt.title('Total Unique Words')
plt.show()

# Plotting unique words per page
plt.barh(X, WPP, align='center', alpha=0.5)
plt.yticks(X, newspapers)
plt.xlabel('Word Count')
plt.title('Unique Words per Page')
plt.show()
# Plotting the count of numeric figures used per newspaper
# (stats.pkl is assumed to hold one list of extracted numeric figures per paper)
with open('stats.pkl', 'rb') as file:
    stats = pickle.load(file)
statsLen = [len(li) for li in stats]
barlist = plt.barh(X, statsLen, align='center', alpha=0.5)
barlist[0].set_color('0.4')
barlist[1].set_color('r')
barlist[2].set_color('b')
barlist[3].set_color('g')
plt.yticks(X, newspapers)
plt.xlabel('Numeric Figures Used')
plt.title('Numeric Figures Used')
plt.show()
# Plotting sentiment analysis (subjectivity)
from textblob import TextBlob
sentiment = []
for i in np.arange(4):
    sentiment.append(TextBlob(corpus[i]).subjectivity)
plt.scatter(X, sentiment, linewidths=5)
plt.xticks(X, newspapers)
plt.ylabel("<--Facts-----------------Opinions-->")
plt.title("Subjectivity Graph")
plt.show()
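# A minimal extension (assumption: TextBlob polarity is also of interest; it is not
# part of the original subjectivity plot). Polarity runs from -1 (negative) to +1 (positive).
polarity = [TextBlob(corpus[i]).polarity for i in np.arange(4)]
plt.scatter(X, polarity, linewidths=5)
plt.xticks(X, newspapers)
plt.ylabel("<--Negative-----------------Positive-->")
plt.title("Polarity Graph")
plt.show()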
# Calculating image counts per newspaper
imagesCount = []
BasePath = os.path.join(os.getcwd(), "NLP_ExtractImages")
paths = [os.path.join(BasePath, "TH"), os.path.join(BasePath, "TOI"), os.path.join(BasePath, "IE"), os.path.join(BasePath, "HT")]
for path in paths:
    counter = 0
    for entry in os.scandir(path):
        size = entry.stat().st_size
        if size > 5000:   # skip tiny files (likely icons/decorations); count only significant images
            counter += 1
    imagesCount.append(counter)
# Plotting image counts
barlist = plt.bar(X, imagesCount, align='center', alpha=0.5)
barlist[0].set_color('0.4')
barlist[1].set_color('r')
barlist[2].set_color('b')
barlist[3].set_color('g')
plt.xticks(X, newspapers)
plt.ylabel('No of Significant Images')
plt.title('No of Significant Images')
plt.show()