Sentiment analysis
def clean_text(text, all_mentions):
    # If the tweet is a retweet, delete the leading "RT @account:" part
    text = re.sub(r'(RT\s.*):', '', text)
    # Find all links and delete them
    all_links = re.findall(r'(https:.*?)\s', text + ' ')
    for link in all_links:
        text = text.replace(link, '')
    # Delete every @mention
    for mention in all_mentions:
        text = text.replace('@' + mention, '')
    # Tokenize
    tokens = word_tokenize(text.replace('-', ' '))
    # Convert to lower case
    tokens = [w.lower() for w in tokens]
    # Remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # Filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    phrase = " ".join(words)
    return phrase, all_links

for i in results_apple:
    i['clean_text'], i['all_link'] = clean_text(i['full_text'], [j['screen_name'] for j in i['entities']['user_mentions']])
for i in results_facebook:
    i['clean_text'], i['all_link'] = clean_text(i['full_text'], [j['screen_name'] for j in i['entities']['user_mentions']])
for i in results_amazon:
    i['clean_text'], i['all_link'] = clean_text(i['full_text'], [j['screen_name'] for j in i['entities']['user_mentions']])

results_apple[0]
variables_we_need = ['created_at', 'id', 'full_text', 'entities', 'user', 'coordinates', 'retweet_count', 'favorite_count', 'lang']

def get_all_tweets(count=100, q='', lang='', since='', until='', tweet_mode='extended'):
    results = []
    tweets = tweepy.Cursor(api.search, q=q, lang=lang, since=since, until=until, tweet_mode=tweet_mode).items(count)
    for tweet in tweets:
        # Keep only the fields we need from the raw tweet JSON
        d = {}
        for variable in variables_we_need:
            d[variable] = tweet._json[variable]
        results.append(d)
    return results
results_apple = get_all_tweets(count=1000, q='apple', tweet_mode='extended', lang='en', since='2021-10-25', until='2021-10-31')
results_facebook = get_all_tweets(count=1000, q='facebook', tweet_mode='extended', lang='en', since='2021-10-25', until='2021-10-31')
results_amazon = get_all_tweets(count=1000, q='amazon', tweet_mode='extended', lang='en', since='2021-10-25', until='2021-10-31')

results_apple[0]
# Pull a single tweet to inspect the raw JSON structure
tweets = tweepy.Cursor(api.search, q='Apple', tweet_mode='extended').items(1)
one_tweet = next(tweets)
one_tweet._json
# Download and load the FinBERT pretrained model
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

possible_sentiments = ['negative', 'neutral', 'positive']

# Get sentiments: the pipeline returns only the winning label, so the other two classes are set to 0
def get_sentiments(input_dict, variable_text):
    for item_ in input_dict:
        sentiment = sentiment_analysis(item_[variable_text])
        for item in sentiment:
            for shade in possible_sentiments:
                if item['label'] == shade:
                    item_[shade] = item['score']
                else:
                    item_[shade] = 0
    return input_dict

results_apple = get_sentiments(results_apple, 'clean_text')
results_facebook = get_sentiments(results_facebook, 'clean_text')
results_amazon = get_sentiments(results_amazon, 'clean_text')
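For reference, here is a minimal sketch of calling the pipeline on a single string (the input text below is an invented example, not data from the collection above): it returns a list with one dict per input, containing only the top label and its score, which is why get_sentiments zeroes out the other two classes.

# Minimal sketch: one call to the FinBERT pipeline on a made-up example string
example_output = sentiment_analysis("apple reports record quarterly revenue")
# example_output is a list with one dict per input, e.g. [{'label': ..., 'score': ...}]
print(example_output[0]['label'], example_output[0]['score'])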
# Get trending topics for New York (WOEID 2459115)
trends_result = api.trends_place(id=2459115)[0]['trends']
trends = {}
for i in trends_result:
    trends[i['name']] = i['tweet_volume']
trends_names = ' '.join(list(trends.keys()))
import os
import re
import tweepy
from tweepy import OAuthHandler
import numpy as np
import pandas as pd
# Text processing
import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Word cloud
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
# Graphs
import plotly.io as pio
pio.renderers.default = 'browser'
import plotly.express as px
# Transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForTokenClassification
pip install -r requirements.txt
# Import Packages
from newscatcherapi import NewsCatcherApiClient
import time

# Initialize the NewsCatcher API client
newscatcherapi = NewsCatcherApiClient(x_api_key='YOUR-X-API-KEY')

# Extract news: 10 pages of 100 articles per company
apple_articles = []
facebook_articles = []
amazon_articles = []

for i in range(1, 11):
    apple_articles.extend(newscatcherapi.get_search(q='(Apple AND company) OR "Apple Inc"',
                                                    lang='en',
                                                    from_='2021-10-25',
                                                    to_='2021-10-31',
                                                    page_size=100,
                                                    page=i)['articles'])
    time.sleep(1)
    facebook_articles.extend(newscatcherapi.get_search(q='(Facebook AND company) OR "Facebook Inc"',
                                                       lang='en',
                                                       from_='2021-10-25',
                                                       to_='2021-10-31',
                                                       page_size=100,
                                                       page=i)['articles'])
    time.sleep(1)
    amazon_articles.extend(newscatcherapi.get_search(q='(Amazon AND company) OR "Amazon Inc"',
                                                     lang='en',
                                                     from_='2021-10-25',
                                                     to_='2021-10-31',
                                                     page_size=100,
                                                     page=i)['articles'])
    time.sleep(1)
# Load the hashtag image used as the word cloud mask
hashtag = np.array(Image.open("hashtag.jpg"))
# Turn pure black pixels white so WordCloud treats them as masked-out background
hashtag[hashtag == 0] = 255

wordcloud = WordCloud(background_color="white", max_words=200, mask=hashtag, contour_width=3, contour_color='firebrick', collocations=False).generate(trends_names)
plt.figure(figsize=[20, 10])
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
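If you want to keep the image rather than only display it, the generated WordCloud object can also be written straight to disk; the file name below is just an example.

# Optional: save the generated word cloud to an image file (file name is an example)
wordcloud.to_file("trends_wordcloud.png")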
# Read Twitter API credentials from environment variables
consumer_key = os.environ['CONSUMER_KEY']
consumer_secret = os.environ['CONSUMER_SECRET']
access_token = os.environ['ACCESS_TOKEN']
access_token_secret = os.environ['ACCESS_SECRET']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
# Score article titles with FinBERT and keep only the sentiment columns
apple_articles_pd = pd.DataFrame(get_sentiments(apple_articles, 'title')).loc[:, ['negative', 'neutral', 'positive']]
facebook_articles_pd = pd.DataFrame(get_sentiments(facebook_articles, 'title')).loc[:, ['negative', 'neutral', 'positive']]
amazon_articles_pd = pd.DataFrame(get_sentiments(amazon_articles, 'title')).loc[:, ['negative', 'neutral', 'positive']]

# Average the scores per company
total_score_articles = pd.concat([apple_articles_pd.mean(), facebook_articles_pd.mean(), amazon_articles_pd.mean()], axis=1)
total_score_articles = total_score_articles.transpose()
total_score_articles = total_score_articles.reset_index()
total_score_articles.columns = ['Company', 'negative', 'neutral', 'positive']
total_score_articles['Company'] = ['Apple', 'Facebook', 'Amazon']

# Sentiment Score
total_score_articles

# Graph
fig = px.histogram(total_score_articles,
                   x='Company',
                   title='Sentiment Score by Company | News Articles',
                   y=['negative', 'neutral', 'positive'],
                   barmode='group',
                   color_discrete_sequence=["red", "blue", "green"])
fig.update_xaxes(title='Companies').update_yaxes(title='Sentiment score')
fig.show()
# Create DataFrames with only the sentiment columns
apple_tweets_pd = pd.DataFrame(results_apple).loc[:, ['negative', 'neutral', 'positive']]
facebook_tweets_pd = pd.DataFrame(results_facebook).loc[:, ['negative', 'neutral', 'positive']]
amazon_tweets_pd = pd.DataFrame(results_amazon).loc[:, ['negative', 'neutral', 'positive']]

# Concatenate the average scores per company
total_score_tweets = pd.concat([apple_tweets_pd.mean(), facebook_tweets_pd.mean(), amazon_tweets_pd.mean()], axis=1)
total_score_tweets = total_score_tweets.transpose()
total_score_tweets = total_score_tweets.reset_index()
total_score_tweets.columns = ['Company', 'negative', 'neutral', 'positive']
total_score_tweets['Company'] = ['Apple', 'Facebook', 'Amazon']
total_score_tweets

# Visualize
fig = px.histogram(total_score_tweets,
                   x='Company',
                   title='Sentiment Score by Company | Tweets',
                   y=['negative', 'neutral', 'positive'],
                   barmode='group',
                   color_discrete_sequence=["red", "blue", "green"])
fig.update_xaxes(title='Companies').update_yaxes(title='Sentiment score')
fig.show()
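Since the Plotly renderer is set to 'browser', the figures open as temporary pages; if you want to keep a chart, it can also be exported to a standalone HTML file. The file name below is just an example.

# Optional: save the last figure as a standalone HTML file (file name is an example)
fig.write_html("tweets_sentiment_by_company.html")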