@jrjames83
Created February 27, 2017 00:06
Python Twitter / Tweepy, Plus Sklearn Count Vectorizer Detailed Walk Through and TextBlob Sentiment Analysis
import tweepy
import requests
import pandas as pd
import numpy as np  # needed for the np.dot calls later on
import json

# Twitter API credentials
consumer_key = "get"
consumer_secret = "your"
access_key = "own"
access_secret = "creds"
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
def get_user_tweets(username):
    tweets = api.user_timeline(screen_name=username, count=200)
    return tweets

trump = get_user_tweets("realDonaldTrump")
taleb = get_user_tweets("nntaleb")
google = get_user_tweets("google")

all_tweets = trump + google + taleb
# Quickly examine the data returned
# text, favorite_count, created_at, user.screen_name, retweet_count
trump[0]._json
{'contributors': None,
 'coordinates': None,
 'created_at': 'Sat Feb 25 22:02:22 +0000 2017',
 'entities': {'hashtags': [], 'symbols': [], 'urls': [], 'user_mentions': []},
 'favorite_count': 63532,
 'favorited': False,
 'geo': None,
 'id': 835610917568200705,
 'id_str': '835610917568200705',
 'in_reply_to_screen_name': None,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'is_quote_status': False,
 'lang': 'en',
 'place': None,
 'retweet_count': 16432,
 'retweeted': False,
 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 'text': 'Congratulations to Thomas Perez, who has just been named Chairman of the DNC. I could not be happier for him, or for the Republican Party!',
 'truncated': False,
 'user': {'contributors_enabled': False,
  'created_at': 'Wed Mar 18 13:46:38 +0000 2009',
  'default_profile': False,
  'default_profile_image': False,
  'description': '45th President of the United States of America',
  'entities': {'description': {'urls': []}},
  'favourites_count': 45,
  'follow_request_sent': False,
  'followers_count': 25550879,
  'following': False,
  'friends_count': 43,
  'geo_enabled': True,
  'has_extended_profile': False,
  'id': 25073877,
  'id_str': '25073877',
  'is_translation_enabled': True,
  'is_translator': False,
  'lang': 'en',
  'listed_count': 65153,
  'location': 'Washington, DC',
  'name': 'Donald J. Trump',
  'notifications': False,
  'profile_background_color': '6D5C18',
  'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg',
  'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg',
  'profile_background_tile': True,
  'profile_banner_url': 'https://pbs.twimg.com/profile_banners/25073877/1485301108',
  'profile_image_url': 'http://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg',
  'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg',
  'profile_link_color': '0D5B73',
  'profile_sidebar_border_color': 'BDDCAD',
  'profile_sidebar_fill_color': 'C5CEC0',
  'profile_text_color': '333333',
  'profile_use_background_image': True,
  'protected': False,
  'screen_name': 'realDonaldTrump',
  'statuses_count': 34539,
  'time_zone': 'Eastern Time (US & Canada)',
  'translator_type': 'regular',
  'url': None,
  'utc_offset': -18000,
  'verified': True}}
for_df = []

for t in all_tweets:
    for_df.append({
            "user": t.user.screen_name, 
            "text": t.text,
            "created_at": t.created_at,
            "retweets": t.retweet_count
        })
    
df = pd.DataFrame.from_records(for_df)
df.head(10)
created_at retweets text user
0 2017-02-25 22:02:22 16432 Congratulations to Thomas Perez, who has just ... realDonaldTrump
1 2017-02-25 21:53:21 14973 I will not be attending the White House Corres... realDonaldTrump
2 2017-02-25 18:12:25 6836 Weekly Address from @WhiteHouse: https://t.co/... realDonaldTrump
3 2017-02-25 13:27:04 15695 Great optimism for future of U.S. business, AN... realDonaldTrump
4 2017-02-25 13:19:18 47204 The media has not reported that the National D... realDonaldTrump
5 2017-02-25 12:25:24 27019 Maybe the millions of people who voted to MAKE... realDonaldTrump
6 2017-02-25 03:09:18 23441 FAKE NEWS media knowingly doesn't tell the tru... realDonaldTrump
7 2017-02-24 17:04:19 17354 Trump vows to fight 'epidemic' of human traffi... realDonaldTrump
8 2017-02-24 13:49:27 8493 Going to CPAC! realDonaldTrump
9 2017-02-24 12:36:34 20371 find the leakers within the FBI itself. Classi... realDonaldTrump
df.shape
(600, 4)
# Check tweets by user, some useful pandas filtering techniques
print(len(df[df['user'] == 'nntaleb']['text'].unique())) #confirm all are unique
df.groupby('user').count()
200
created_at retweets text
user
Google 200 200 200
nntaleb 200 200 200
realDonaldTrump 200 200 200

Which tweets are most similar?

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from collections import defaultdict


documents = df['text'].tolist() #get all tweets into a list

# Since tweets are so short, we're not too concerned about inverse-document-frequency
# normalization schemes; we just want to find tweets that share similar words.
# Because tweets are short, a bigram range helps expand the vocabulary, which ends up
# having approximately 6,400 terms.

# Aside: why would a bigram vectorizer make a difference if the unigrams are counted once anyway?
# .....well, you get 3 votes: one for each unigram and one for the bigram....so, yeah (toy check below)

vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))
X = vectorizer.fit_transform(documents)
X.shape
(600, 6404)
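To make the "3 votes" aside concrete, here's a toy check with made-up strings (toy and toy_vect are illustrative names only):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

toy = ["fake news media",
       "fake news today",
       "news about fake stories"]
toy_vect = CountVectorizer(ngram_range=(1, 2))
T = toy_vect.fit_transform(toy)
# Docs 0 and 1 share 'fake', 'news' AND the bigram 'fake news' -> 3 matching features.
# Docs 0 and 2 share 'fake' and 'news' but no bigram -> only 2. Bigrams reward contiguity.
print(linear_kernel(T[0], T).flatten())  # [ 5.  3.  2.]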
# How to access the vocabulary (handy to know). Beware, though: zip pairs the
# (alphabetical) feature names with the *rows* of X, so this only covers the first
# 600 terms, and the values are document rows, not counts; the real term-to-column
# mapping is vectorizer.vocabulary_ (see below)
vocab = dict(zip(vectorizer.get_feature_names(), X))

# Check some of the longer bigrams....
sorted(vocab.items(), key=lambda x: len(x[0]), reverse=True)[:5]
[('assalehamer ctheofilopoulos',
  <1x6404 sparse matrix of type '<class 'numpy.int64'>'
  	with 3 stored elements in Compressed Sparse Row format>),
 ('a_epiphanes4 ryansroberts',
  <1x6404 sparse matrix of type '<class 'numpy.int64'>'
  	with 23 stored elements in Compressed Sparse Row format>),
 ('8z4rob3sf5 dominikleusder',
  <1x6404 sparse matrix of type '<class 'numpy.int64'>'
  	with 23 stored elements in Compressed Sparse Row format>),
 ('americans overwhelmingly',
  <1x6404 sparse matrix of type '<class 'numpy.int64'>'
  	with 17 stored elements in Compressed Sparse Row format>),
 ('artandapostasy franklin',
  <1x6404 sparse matrix of type '<class 'numpy.int64'>'
  	with 9 stored elements in Compressed Sparse Row format>)]
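As flagged above, the term-to-column mapping you usually want is vectorizer.vocabulary_, a plain dict; a quick sketch against the X fit above:

# Term -> column index in X (covers all 6404 terms, unlike the truncated zip above)
vocab_map = vectorizer.vocabulary_
print(len(vocab_map))  # 6404, matching X.shape[1]

# Reverse lookup: column index -> term
index_to_term = {j: term for term, j in vocab_map.items()}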
# To better illustrate the overall concept, take 5 tweets and limit the features to 20 terms

sample = df['text'].tolist()[:5]
vect = CountVectorizer(min_df=0., max_df=1.0, max_features=20)
Z = vect.fit_transform(sample)
# Original concept https://gist.github.com/larsmans/3745866
print(pd.DataFrame(Z.A, columns=vect.get_feature_names()).to_string())

print("--------------------------------------------------")
print(sample[0])
# sample 0 does not have "11th" or "and", but contains "be", has "for" 2x, etc.
# Note the relationship to the matrix printed above the divider, which has a row for
# each doc and a column for each term; the (i, j)th value is the frequency of term j in document i
   11th  and  be  billion  co  first  for  great  has  https  in  national  not  obama  of  optimism  or  party  perez  the
0     0    0   1        0   0      0    2      0    1      0   0         0    1      0   1         0   1      1      1    2
1     0    1   1        0   0      0    0      1    0      0   0         0    1      0   0         0   0      0      0    1
2     0    0   0        0   2      0    0      0    0      2   0         0    0      0   0         0   0      0      0    0
3     1    1   0        0   0      0    1      1    0      0   0         0    0      0   1         1   0      0      0    1
4     0    0   0        2   0      2    0      0    1      0   2         1    1      1   0         0   0      0      0    2
--------------------------------------------------
Congratulations to Thomas Perez, who has just been named Chairman of the DNC. I could not be happier for him, or for the Republican Party!
# For each of the 600 tweets, we want to find the few most relevant tweets
# Default dict which keys off the index of the dataframe (effectively)
# X[x] is the xth entry in the count vectorized matrix X

store = defaultdict(list)

for x in range(600):
    store[x].extend(linear_kernel(X[x], X).flatten().argsort()[:-5:-1])
    
# The first element of each result is the tweet's own index - a tweet is most similar to itself
print (store[0])
print (store[1])
print (store[2])
[0, 19, 141, 258]
[1, 68, 178, 24]
[2, 18, 411, 55]

Linear Kernel, WTF?

  • Fear not....below we explore what it all means.
  • Here's the gist: your document row has a number (0 or N) depending on whether the tweet (document) contained the term (column index). Take that row as a vector (1 x number of terms) and multiply it by the large overall matrix.
  • To make the matrix multiplication work, you transpose the large matrix, so the result has one entry per tweet.
  • Each entry in the resulting product is a similarity measure between the document in question and one other document.
  • Below we review the idea with linear_kernel from sklearn, and then with plain numpy linear algebra operations (a toy dense sketch follows this list).
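First, a minimal dense sketch of the shapes involved, using made-up numbers rather than the real matrix:

import numpy as np

# Toy term-count matrix: 3 documents (rows) x 4 terms (columns), made-up counts
M = np.array([[1, 0, 2, 0],
              [1, 1, 0, 0],
              [0, 0, 2, 1]])
q = M[0]                # the document row we're comparing (1 x 4)
sims = np.dot(q, M.T)   # (1 x 4) times (4 x 3) -> one similarity score per document
print(sims)             # [5 1 4]: doc 0 vs. itself, doc 1, doc 2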
# Let's scope out the 33rd tweet and find some similar tweets - here we use the linear_kernel function from sklearn
print(linear_kernel(X[33], X).flatten().argsort()[:-5:-1])

# Here's the same thing, but we use numpy to dot the 33rd row of X against the transpose of X. 
# Keep in mind X is just the Count Vectorized Matrix
np.dot(X[33], X.T).toarray().flatten().argsort()[:-5:-1]
[ 33  51 423  88]
array([ 33,  51, 423,  88], dtype=int64)
# Let's look at the 33rd, 51st, 423rd, and 88th tweets.
# They all seem to have "Trump" in common.
# They all have a Twitter short link; this could evolve into a stopword
# in a "serious analysis" (a sketch follows the output below).

for idx in [33, 51, 423, 88]:
    print(idx, df.iloc[idx]['text'])
33 'Trump signs bill undoing Obama coal mining rule' https://t.co/yMfT5r5RGh
51 'Remarks by President Trump at Signing of H.J. Resolution 41'
https://t.co/Q3MoCGAc54 https://t.co/yGDDTKm9Br
423 RT @normonics: The Minority Rule. cc @nntaleb https://t.co/RMCNdH8LMG
88 'Majority in Leading EU Nations Support Trump-Style Travel Ban' 
Poll of more than 10,000 people in 10 countries...https://t.co/KWsIWhtC9o
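If you did want to promote those link fragments to stopwords, CountVectorizer accepts a custom list. A sketch only (link_free_vect is hypothetical and not used below); under the default token pattern a t.co URL tokenizes into 'https', 'co', and a unique hash:

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# Fold the link fragments into the stop list so t.co URLs stop matching each other
custom_stops = list(ENGLISH_STOP_WORDS) + ['https', 'co']
link_free_vect = CountVectorizer(stop_words=custom_stops, ngram_range=(1, 2))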

Argsort, negative list slicing??

  • OK, let's step through it from scratch using the 33rd tweet again
# Keep going with the 33rd tweet
np.dot(X[33], X.T).toarray().flatten()[:50]
# Notice the score of 17 near the middle...could that be the 33rd position?
array([ 0,  0,  2,  0,  1,  0,  0,  2,  0,  0,  0,  0,  0,  2,  0,  0,  0,
        1,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0, 17,
        0,  0,  0,  0,  0,  1,  1,  1,  2,  1,  0,  2,  0,  0,  0,  0], dtype=int64)
np.dot(X[33], X.T).toarray().flatten()[33]
# Aha! Yes - when you dot the row with itself you get the sum of its squared
# term counts; every count here is 1, so the score equals the number of distinct terms
17

Inverse Transform, a method on the vectorizer that returns the non-zero terms each tweet had.....note that the count matches the score above

vectorizer.inverse_transform(X)[33].shape # INTERESTING!!!!!
(17,)
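A quick sanity check on the two 17s: the self-score is the sum of squared counts, which collapses to the number of distinct terms because every count in this row is 1:

row = X[33].toarray().flatten()
print((row ** 2).sum())   # 17 -- the dot product of the row with itself
print((row != 0).sum())   # 17 -- distinct non-zero terms, the same number here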
# Back to argsort - note the very last elements
# argsort returns the indices of the values, ranked in ascending order of score,
# so the highest scores come last (note how 33 is last - its own score is the 17 from above)
np.dot(X[33], X.T).toarray().flatten().argsort()
array([  0, 355, 354, 353, 352, 351, 349, 348, 347, 346, 344, 343, 341,
       357, 340, 338, 337, 335, 334, 332, 331, 330, 328, 327, 326, 325,
       324, 339, 358, 359, 360, 397, 395, 393, 392, 391, 390, 389, 388,
       387, 386, 385, 384, 383, 382, 380, 379, 378, 375, 371, 370, 368,
       367, 366, 365, 364, 362, 361, 323, 399, 322, 320, 280, 279, 278,
       277, 276, 275, 274, 272, 271, 268, 266, 265, 281, 264, 261, 260,
       259, 258, 257, 255, 254, 253, 251, 250, 249, 247, 263, 282, 283,
       284, 318, 317, 316, 315, 313, 312, 311, 310, 309, 308, 307, 306,
       305, 304, 303, 300, 298, 296, 295, 294, 292, 291, 289, 288, 287,
       286, 285, 321, 246, 400, 402, 533, 532, 530, 529, 526, 525, 524,
       523, 522, 520, 519, 518, 534, 517, 513, 511, 509, 504, 503, 502,
       501, 500, 499, 498, 496, 495, 515, 535, 539, 541, 596, 594, 592,
       590, 587, 583, 580, 578, 577, 576, 575, 574, 572, 571, 570, 568,
       567, 566, 565, 564, 560, 557, 555, 554, 551, 543, 542, 493, 401,
       492, 487, 440, 439, 438, 437, 436, 435, 434, 433, 432, 431, 428,
       427, 441, 426, 422, 417, 415, 414, 412, 410, 408, 407, 406, 405,
       404, 403, 424, 442, 444, 445, 485, 481, 480, 479, 478, 477, 476,
       475, 472, 471, 469, 468, 467, 466, 463, 462, 461, 459, 458, 457,
       456, 453, 451, 450, 449, 448, 447, 491, 244, 299, 242, 140, 139,
       137, 136, 135,  44, 133, 132, 131,  38, 130, 127,  46, 125, 124,
        47, 123, 122, 121, 120, 129, 119,  37,  36,  29,  30, 160, 159,
        31, 158,  32, 156, 155, 141, 154,  34, 150, 149, 148,  35, 147,
       146, 145, 144, 153, 165, 118, 116,  91,  60,  61,  62,  85,  84,
        83,  82,  81,  59,  63,  64,  78,  65,  68,  69,  75,  74,  73,
        72,  79, 117,  93,  96, 115, 114, 113,  48, 112, 111, 110, 109,
       108,  95,  49, 107, 106,  53, 105, 104, 103, 101, 100,  97, 243,
       166, 152,  27, 215,  10, 213, 212, 211, 210,  11,  12, 167, 206,
       205, 204,  14, 203, 202,  15,  16,  19,  20,   9,  21, 217, 219,
         1, 241, 239, 238, 237,   3, 236, 235, 232, 231, 229,   5, 227,
       225, 224,   6, 222,   8, 220, 218, 196, 207, 194, 184,  24, 183,
       182, 181, 180,  23, 179, 195, 174, 172, 171,  26, 169,  25,  22,
        70, 191, 193, 189, 192, 190, 483, 545, 484, 516, 588, 573, 482,
       488, 489, 490, 586, 548, 547, 486, 589,   4, 591, 563, 544, 536,
       537,  66, 538, 562, 593, 595, 540, 474, 585, 549, 584,  39, 559,
       558, 556, 527, 528, 508,  40, 507, 521, 506, 510, 505,  41,  50,
        58, 569, 512, 552, 473, 579,  17,  43, 497,  56, 581, 582,  57,
       494, 531, 550,  52,  54, 421, 470, 198, 197, 188, 314, 187, 186,
       185, 319, 178, 177, 176, 302, 175, 173, 333, 170, 336, 168, 342,
       164, 163, 345, 161, 350, 329, 301, 199, 200, 245, 240, 248, 252,
       234, 256, 233, 230, 262, 228, 226, 267, 269, 270, 223, 273, 221,
       216, 214, 209, 208, 290, 293, 201, 297, 157, 356, 162, 363, 418,
       419, 420, 151, 425, 429, 430, 102,  99,  98,  94, 443,  90, 446,
        86, 452, 455,  80, 460,  77,  76, 464, 465, 416, 413, 599, 409,
       143, 369, 142, 372, 373, 374, 376, 377, 381, 134, 128, 138, 126,
       398, 396, 394, 597,   2,  67, 561,  71, 553, 454,  55,  87,  45,
        28,  89,   7,  18, 411,  92,  42, 598, 546, 514,  13,  88, 423,
        51,  33], dtype=int64)
# Let's do some list slicing fun
# [::-1] reverses the array and [:4] provides the first 4 elements of the reverse sort!
# Hopefully that made some sense! (A toy example follows the output.)
np.dot(X[33], X.T).toarray().flatten().argsort()[::-1][:4]
array([ 33,  51, 423,  88], dtype=int64)
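If the slicing still feels opaque, a toy array with made-up scores spells it out:

toy_scores = np.array([3, 17, 0, 2, 5])
order = toy_scores.argsort()         # indices in ascending score order: [2 3 0 4 1]
print(order[::-1][:3])               # top 3 indices by score: [1 4 0]
print(toy_scores.argsort()[:-4:-1])  # the negative-step slice gives the same: [1 4 0]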

A semi-pythonic way to get the similar tweets ready to be incorporated back into the dataframe

rel_tweets = defaultdict(list)

for k,v in store.items():
    for tweet in v:
        rel_tweets[k].append(df.iloc[tweet]['text'])


# Check out some related tweets        
print("\n".join(rel_tweets[0])) # "named" is in common, "like" is in common......

print("--------------------------------------------------")

# use a set to find intersection of terms
print("\n".join(rel_tweets[9])) #"given" appears twice, "classified information" in 2 tweets, etc...
Congratulations to Thomas Perez, who has just been named Chairman of the DNC. I could not be happier for him, or for the Republican Party!
Just named General H.R. McMaster National Security Advisor.
Nancy Pelosi and Fake Tears Chuck Schumer held a rally  at the steps of The Supreme Court and mic did not work (a mess)-just like Dem party!
@goochthegreat Hi there. We'd like to help. Just to confirm, are you still able to sign into your account? Let us know.
--------------------------------------------------
find the leakers within the FBI itself. Classified information is being given to media that could have a devastating effect on U.S. FIND NOW
The real scandal here is that classified information is illegally given out by "intelligence" like candy. Very un-American!
Information is being illegally given to the failing @nytimes &amp; @washingtonpost by the intelligence community (NSA and FBI?).Just like Russia
The FBI is totally unable to stop the national security "leakers" that have permeated our government for a long time. They can't even......
# I don't feel like joining it back in via a Series, however - let's just use the
# index of the dataframe, since we know the X matrix is ordered row-wise
# identically to the dataframe.

# A map function will help us here (with a lambda z).
# You can call .map on the index of any dataframe.
df['rel_tweets'] = df.index.map(lambda z: np.dot(X[z], X.T).toarray().flatten().argsort()[::-1][:4])
df.head()
created_at retweets text user rel_tweets
0 2017-02-25 22:02:22 16432 Congratulations to Thomas Perez, who has just ... realDonaldTrump [0, 19, 141, 258]
1 2017-02-25 21:53:21 14973 I will not be attending the White House Corres... realDonaldTrump [1, 68, 178, 24]
2 2017-02-25 18:12:25 6836 Weekly Address from @WhiteHouse: https://t.co/... realDonaldTrump [2, 18, 411, 55]
3 2017-02-25 13:27:04 15695 Great optimism for future of U.S. business, AN... realDonaldTrump [3, 124, 38, 119]
4 2017-02-25 13:19:18 47204 The media has not reported that the National D... realDonaldTrump [4, 90, 129, 45]

The nice thing about dataframes is that rel_tweets is actually a list.

  • you can perform the same operations on it as you would any list, with no intermediate conversions (like splitting a string) needed
  • now we can look up the tweet text based on the tweet indices!
  • we'll use a function called "get_rel_tweets" which takes the rel_tweets value from each row, looks up the tweets by their index, and returns them via a list comprehension, for handy use down the line
def get_rel_tweets(row):
    vals = row[1:] #ignore the first entry which is itself
    return [df.iloc[x]['text'] for x in vals]

df['rel_tweet_text'] = df['rel_tweets'].map(get_rel_tweets)

df.head()
created_at retweets text user rel_tweets rel_tweet_text
0 2017-02-25 22:02:22 16432 Congratulations to Thomas Perez, who has just ... realDonaldTrump [0, 19, 141, 258] [Just named General H.R. McMaster National Sec...
1 2017-02-25 21:53:21 14973 I will not be attending the White House Corres... realDonaldTrump [1, 68, 178, 24] [A working dinner tonight with Prime Minister ...
2 2017-02-25 18:12:25 6836 Weekly Address from @WhiteHouse: https://t.co/... realDonaldTrump [2, 18, 411, 55] [Congratulations to our new National Security ...
3 2017-02-25 13:27:04 15695 Great optimism for future of U.S. business, AN... realDonaldTrump [3, 124, 38, 119] [Meeting with biggest business leaders this mo...
4 2017-02-25 13:19:18 47204 The media has not reported that the National D... realDonaldTrump [4, 90, 129, 45] [Thank you Brian Krzanich, CEO of @Intel. A gr...
# Let's spotcheck one of them
df.tail()
created_at retweets text user rel_tweets rel_tweet_text
595 2017-02-09 19:11:08 2 @CutTheKnotMath Voila. Could not find a clean ... nntaleb [595, 573, 470, 418] [@CutTheKnotMath Voila. https://t.co/xUNcf6Rna...
596 2017-02-09 16:18:28 0 @CutTheKnotMath Would a solution via calculus ... nntaleb [596, 421, 502, 503] [We can visualize the solution obtained via ca...
597 2017-02-09 13:01:42 85 2) Hence low risks of violence/ unrest. In spi... nntaleb [597, 553, 51, 474] [We think alike. From Silent Risk (this was 20...
598 2017-02-09 12:34:14 85 1)Cond. Prob.\nConditional on having voted Tru... nntaleb [598, 458, 77, 405] [@CutTheKnotMath @ctrzcinka \n2nd route: Rearr...
599 2017-02-09 02:42:23 251 Indeed. https://t.co/cp67FVtk28 nntaleb [599, 55, 42, 51] [Congratulations Treasury Secretary Steven Mnu...
print(df.iloc[595]['text'])
print(df.iloc[595]['rel_tweet_text'])
# Here, cuttheknotmath is driving the text similarity
@CutTheKnotMath Voila. Could not find a clean inequality. https://t.co/F0UwcEIEVj
['@CutTheKnotMath Voila. https://t.co/xUNcf6RnaI', 'The smell of mathematical inequality on Sunday evening. @CutTheKnotMath https://t.co/pKbrczGMII', "Interesting discussion around the speculation of link between lifespan hearbeats/breathing and Jensen's inequality.… https://t.co/qqRJpEmbfA"]

Wait, what about the scores? How related are each of the tweets?

  • Let's go back to our dot product with the similarity scores, say for the 33rd tweet
np.dot(X[33], X.T).toarray().flatten().argsort()[::-1][:4]
array([ 33,  51, 423,  88], dtype=int64)
# Here we want the values of np.dot(X[33], X.T).toarray().flatten() as a data structure,
# but only at the top indices found above....it's pretty easy.
# The line below looks kinda crazy, but that's because I didn't use any variable names.
np.dot(X[33], X.T).toarray().flatten()[[np.dot(X[33], X.T).toarray().flatten().argsort()[::-1][:4]]]
array([17,  3,  2,  2], dtype=int64)
# So above we have the self-match score, then the next 3 scores. How can we get this into the dataframe now? (A named-variable refactor follows the output below.)
df['scores'] = df.index.map(lambda j: 
                            np.dot(X[j], X.T).toarray().flatten()[[np.dot(X[j], X.T).toarray().flatten().argsort()[::-1][:4]]])

df.head(10)
created_at retweets text user rel_tweets rel_tweet_text scores
0 2017-02-25 22:02:22 16432 Congratulations to Thomas Perez, who has just ... realDonaldTrump [0, 19, 141, 258] [Just named General H.R. McMaster National Sec... [19, 3, 2, 1]
1 2017-02-25 21:53:21 14973 I will not be attending the White House Corres... realDonaldTrump [1, 68, 178, 24] [A working dinner tonight with Prime Minister ... [19, 4, 3, 3]
2 2017-02-25 18:12:25 6836 Weekly Address from @WhiteHouse: https://t.co/... realDonaldTrump [2, 18, 411, 55] [Congratulations to our new National Security ... [15, 4, 4, 4]
3 2017-02-25 13:27:04 15695 Great optimism for future of U.S. business, AN... realDonaldTrump [3, 124, 38, 119] [Meeting with biggest business leaders this mo... [33, 4, 3, 3]
4 2017-02-25 13:19:18 47204 The media has not reported that the National D... realDonaldTrump [4, 90, 129, 45] [Thank you Brian Krzanich, CEO of @Intel. A gr... [29, 2, 2, 2]
5 2017-02-25 12:25:24 27019 Maybe the millions of people who voted to MAKE... realDonaldTrump [5, 123, 115, 20] [Professional anarchists, thugs and paid prote... [17, 11, 5, 5]
6 2017-02-25 03:09:18 23441 FAKE NEWS media knowingly doesn't tell the tru... realDonaldTrump [6, 27, 49, 63] [The FAKE NEWS media (failing @nytimes, @NBCNe... [31, 9, 7, 7]
7 2017-02-24 17:04:19 17354 Trump vows to fight 'epidemic' of human traffi... realDonaldTrump [7, 51, 87, 514] ['Remarks by President Trump at Signing of H.J... [15, 3, 2, 2]
8 2017-02-24 13:49:27 8493 Going to CPAC! realDonaldTrump [8, 376, 183, 236] [@chuurpy Hi there. By chance, did you already... [3, 1, 1, 1]
9 2017-02-24 12:36:34 20371 find the leakers within the FBI itself. Classi... realDonaldTrump [9, 44, 47, 10] [The real scandal here is that classified info... [15, 4, 3, 2]
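The same computation with intermediate variable names, in case the one-liner reads as crazy (top_scores is a hypothetical helper, equivalent to the lambda above):

def top_scores(i, k=4):
    # Similarity of tweet i against all 600 tweets; return the top k scores, highest first
    sims = np.dot(X[i], X.T).toarray().flatten()
    top_idx = sims.argsort()[::-1][:k]
    return sims[top_idx]

# Produces the same 'scores' column as above
df['scores'] = df.index.map(top_scores)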
from textblob import TextBlob
# Get the sentiment for each tweet
df['sentiment'] = df['text'].map(lambda x: TextBlob(x).sentiment)
df['polarity'] = df['sentiment'].map(lambda x: x[0])
df['subjectivity'] = df['sentiment'].map(lambda x: x[1])
df.head()
created_at retweets text user rel_tweets rel_tweet_text scores sentiment polarity subjectivity
0 2017-02-25 22:02:22 16432 Congratulations to Thomas Perez, who has just ... realDonaldTrump [0, 19, 141, 258] [Just named General H.R. McMaster National Sec... [19, 3, 2, 1] (0.0, 0.0) 0.000000 0.000000
1 2017-02-25 21:53:21 14973 I will not be attending the White House Corres... realDonaldTrump [1, 68, 178, 24] [A working dinner tonight with Prime Minister ... [19, 4, 3, 3] (0.5, 0.375) 0.500000 0.375000
2 2017-02-25 18:12:25 6836 Weekly Address from @WhiteHouse: https://t.co/... realDonaldTrump [2, 18, 411, 55] [Congratulations to our new National Security ... [15, 4, 4, 4] (0.0, 0.0) 0.000000 0.000000
3 2017-02-25 13:27:04 15695 Great optimism for future of U.S. business, AN... realDonaldTrump [3, 124, 38, 119] [Meeting with biggest business leaders this mo... [33, 4, 3, 3] (0.25, 0.34375) 0.250000 0.343750
4 2017-02-25 13:19:18 47204 The media has not reported that the National D... realDonaldTrump [4, 90, 129, 45] [Thank you Brian Krzanich, CEO of @Intel. A gr... [29, 2, 2, 2] (0.11481481481481481, 0.3185185185185185) 0.114815 0.318519
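For the record, TextBlob's sentiment is a namedtuple, so attribute access works as well as the positional indexing above (toy string for illustration):

blob = TextBlob("Great optimism for the future!")
print(blob.sentiment)               # Sentiment(polarity=..., subjectivity=...)
print(blob.sentiment.polarity)      # same value as sentiment[0]
print(blob.sentiment.subjectivity)  # same value as sentiment[1]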
# Summarize some of the measures
df.groupby('user').describe()
polarity retweets subjectivity
user
Google count 200.000000 200.000000 200.000000
mean 0.156052 2.790000 0.237317
std 0.210424 18.629039 0.270683
min -0.500000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.100000
75% 0.250000 0.000000 0.500000
max 0.700000 183.000000 1.000000
nntaleb count 200.000000 200.000000 200.000000
mean 0.042648 83.040000 0.250281
std 0.229232 206.919931 0.299460
min -0.600000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 8.000000 0.125000
75% 0.087500 86.500000 0.459091
max 1.000000 1803.000000 1.000000
realDonaldTrump count 200.000000 200.000000 200.000000
mean 0.073978 25473.435000 0.517667
std 0.417676 13898.988554 0.292210
min -1.000000 4827.000000 0.000000
25% -0.102083 16159.250000 0.348437
50% 0.056250 22657.500000 0.500000
75% 0.325595 30177.750000 0.750000
max 1.000000 83458.000000 1.000000