Last active
April 17, 2018 20:50
-
-
Save alialavia/bbd2a7555901f70bae205e11539d5af6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
To evaluate the good or bad score of a tweet, we first tokenize the tweet, and then | |
stem each word in our tweet. We also associate each stem with positive and negative values, | |
respectively, using a dictionary. | |
Finally, we calculate the average word weight of a tweet, and decide if it's a good or bad one | |
based on that. | |
""" | |
import json
import html
import time

import twitter
from nltk import word_tokenize, pos_tag
# Explicit import instead of the original wildcard: PorterStemmer is the
# only name from nltk.stem.porter used in this file.
from nltk.stem.porter import PorterStemmer
# Break down a string into the words worth scoring
def get_words(str, useful_pos=frozenset({'NN'})):
    """Tokenize *str* and return only the words whose part-of-speech tag
    is in *useful_pos*.

    The POS filter was hard-coded to singular nouns ('NN'); it is now a
    keyword parameter with the same default, so existing callers are
    unaffected while new callers can widen the filter (e.g. {'NN', 'NNS'}).

    NOTE(review): the first parameter shadows the builtin ``str``; kept
    as-is for backward compatibility with keyword callers.
    """
    tokens = word_tokenize(str)
    tags = pos_tag(tokens)
    return [word for word, pos in tags if pos in useful_pos]
# Read and deserialize a JSON document
def load_json(json_file):
    """Open *json_file*, parse its contents as JSON, and return the result."""
    with open(json_file) as handle:
        return json.loads(handle.read())
# Calculate the average value of words in list_of_words
def get_average_word_weight(list_of_words, word_weights):
    """Return the mean weight of the Porter stems of *list_of_words*.

    Each word is stemmed and looked up in *word_weights*; stems missing
    from the dictionary contribute 0.0 (best-effort scoring, as in the
    original). Returns 0.0 for an empty list to avoid ZeroDivisionError.

    Changes from the original: removed the stray debug ``print`` of the
    word count (a side effect on every call) and the dead commented-out
    code that referenced an undefined ``missing_words`` dict.
    """
    if not list_of_words:
        return 0.0
    stemmer = PorterStemmer()
    sum_of_word_weights = 0.0
    for word in list_of_words:
        stemmed_word = stemmer.stem(word)
        # Unknown stems are treated as neutral (weight 0).
        sum_of_word_weights += word_weights.get(stemmed_word, 0.0)
    return sum_of_word_weights / len(list_of_words)
# Analyse a tweet using a word-weight dictionary
def anaylse_tweet(tweet_string, word_weights):
    """Score *tweet_string* against *word_weights* and print
    "<tweet>:<average weight>" to stdout.
    """
    avg = get_average_word_weight(get_words(tweet_string), word_weights)
    print (tweet_string + ":" + str(avg))
# Configuration: per-stem weights and twitter credentials from JSON files.
word_weights = load_json("word_weights.json")
credentials = load_json(".cred.json")

# Authenticate against the twitter API; 'extended' mode returns the full
# (untruncated) tweet text.
twitter_api = twitter.Api(
    consumer_key=credentials["consumer_key"],
    consumer_secret=credentials["consumer_secret"],
    access_token_key=credentials["access_token_key"],
    access_token_secret=credentials["access_token_secret"],
    tweet_mode='extended',
)

# Fetch the ten most recent statuses of Donald Trump and score each one
# (HTML entities are unescaped before tokenizing).
for status in twitter_api.GetUserTimeline(screen_name="realDonaldTrump", count=10):
    anaylse_tweet(html.unescape(status.full_text), word_weights)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment