# Check tweets by user, some useful pandas filtering techniques
print(len(df[df['user'] == 'nntaleb']['text'].unique()))  # confirm all are unique
df.groupby('user').count()
200

                 created_at  retweets  text
user
Google                  200       200   200
nntaleb                 200       200   200
realDonaldTrump         200       200   200
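A couple of equivalent sanity checks, in case they're handy (a sketch against the same df):

print(df['user'].value_counts())                      # tweets per user, sorted descending
print(df.duplicated(subset=['user', 'text']).sum())   # 0 would mean no user tweeted the same text twice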
Which tweets are most similar?
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from collections import defaultdict

documents = df['text'].tolist()  # get all tweets into a list

# Since tweets are so short, not too concerned about inverse doc freq normalization schemes
# Just want to find tweets that share similar words
# Since tweets are short, a bigram range helps expand the vocabulary, which ends up having approx 6400 terms
# Aside: why would a bigram vectorizer make a difference if the unigrams are counted once anyway?
# .....well, you get 3 votes, one for each unigram and one for the bigram....so, yeah
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(documents)
X.shape
(600, 6404)
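To see what the bigram range actually adds, here's a tiny sketch on a made-up two-sentence corpus (not from the dataset):

toy = ["minority rule wins", "majority rule wins"]
uni = CountVectorizer(ngram_range=(1, 1)).fit(toy)
bi = CountVectorizer(ngram_range=(1, 2)).fit(toy)
print(uni.get_feature_names())   # unigrams only
print(bi.get_feature_names())    # unigrams plus bigrams like 'minority rule' - the extra "votes"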
# How to access the vocabulary (handy to know)
vocab = dict(zip(vectorizer.get_feature_names(), X))
# Check some of the longer bigrams....
sorted(vocab.items(), key=lambda x: len(x[0]), reverse=True)[:5]
[('assalehamer ctheofilopoulos',
<1x6404 sparse matrix of type '<class 'numpy.int64'>'
with 3 stored elements in Compressed Sparse Row format>),
('a_epiphanes4 ryansroberts',
<1x6404 sparse matrix of type '<class 'numpy.int64'>'
with 23 stored elements in Compressed Sparse Row format>),
('8z4rob3sf5 dominikleusder',
<1x6404 sparse matrix of type '<class 'numpy.int64'>'
with 23 stored elements in Compressed Sparse Row format>),
('americans overwhelmingly',
<1x6404 sparse matrix of type '<class 'numpy.int64'>'
with 17 stored elements in Compressed Sparse Row format>),
('artandapostasy franklin',
<1x6404 sparse matrix of type '<class 'numpy.int64'>'
with 9 stored elements in Compressed Sparse Row format>)]
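Worth noting: iterating a sparse matrix yields its rows, so the zip above actually pairs feature names with document rows of X (hence the 1x6404 matrices in the output), not with per-term columns. If what you want is the term-to-column mapping, the vectorizer already keeps one; a small sketch:

# vocabulary_ maps each term to its column index in X
col = vectorizer.vocabulary_['americans overwhelmingly']   # bigram seen in the output above
print(col, X[:, col].sum())                                # column index and total count across all tweets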
# better illustrate the overall concept - take 5 tweets and limit features to 20 terms
sample = df['text'].tolist()[:5]
vect = CountVectorizer(min_df=0., max_df=1.0, max_features=20)
Z = vect.fit_transform(sample)
# Original concept https://gist.github.com/larsmans/3745866
print(pd.DataFrame(Z.A, columns=vect.get_feature_names()).to_string())
print("--------------------------------------------------")
print(sample[0])
# sample 0 does not have "11th" or "and", but contains "be", has "for" 2x, etc...
# note the relationship to the "matrix" above, which has a row for each doc and a column for each term
# the i,jth value is the frequency of that term in the document
11th and be billion co first for great has https in national not obama of optimism or party perez the
0 0 0 1 0 0 0 2 0 1 0 0 0 1 0 1 0 1 1 1 2
1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1
2 0 0 0 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0
3 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 1
4 0 0 0 2 0 2 0 0 1 0 2 1 1 1 0 0 0 0 0 2
--------------------------------------------------
Congratulations to Thomas Perez, who has just been named Chairman of the DNC. I could not be happier for him, or for the Republican Party!
# For each of the 600 tweets, we want to find the few most relevant tweets
# Default dict which keys off the index of the dataframe (effectively)
# X[x] is the xth row in the count vectorized matrix X
store = defaultdict(list)
for x in range(600):
    store[x].extend(linear_kernel(X[x], X).flatten().argsort()[:-5:-1])
# Note that the first element of each list is the tweet's own index (a tweet is most similar to itself)
print(store[0])
print (store[1])
print (store[2])
Here's the gist: each document is a row with a count (0 or more) in each column, depending on whether the tweet (document) contains that term (the column index). Take that row as a vector (1 x number of terms) and multiply it by the large overall matrix.
To make the matrix multiplication work you need to transpose the large matrix, so the result has one entry per tweet.
Each entry in the resulting product is a similarity measure between the document in question and one of the other documents.
Below we review the idea with linear_kernel from sklearn, and again using plain numpy linear algebra operations.
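To make the arithmetic concrete with a throwaway example (made-up counts over a 3-term vocabulary, nothing to do with the real X):

import numpy as np   # already imported earlier in the notebook

A = np.array([[1, 0, 2],    # doc 0: term counts
              [1, 1, 0]])   # doc 1: term counts
print(np.dot(A[0], A.T))    # [5 1]: doc 0 vs itself = 1*1 + 2*2 = 5, doc 0 vs doc 1 = 1*1 = 1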
# Let's scope out the 33rd tweet and find some similar tweets - here use the linear_kernel function from sklearn
print(linear_kernel(X[33], X).flatten().argsort()[:-5:-1])
# Here's the same thing, but we use numpy to dot the 33rd row of X against the transpose of X
# Keep in mind X is just the count-vectorized matrix
np.dot(X[33], X.T).toarray().flatten().argsort()[:-5:-1]
# Let's look at the 33rd, 51st, 423rd and 88th tweets
# They all seem to have "Trump" in common
# They all have a twitter short link; this could evolve into a stopword in a "serious analysis"
for idx in [33, 51, 423, 88]:
    print(idx, df.iloc[idx]['text'])
33 'Trump signs bill undoing Obama coal mining rule' https://t.co/yMfT5r5RGh
51 'Remarks by President Trump at Signing of H.J. Resolution 41'
https://t.co/Q3MoCGAc54 https://t.co/yGDDTKm9Br
423 RT @normonics: The Minority Rule. cc @nntaleb https://t.co/RMCNdH8LMG
88 'Majority in Leading EU Nations Support Trump-Style Travel Ban'
Poll of more than 10,000 people in 10 countries...https://t.co/KWsIWhtC9o
Argsort, negative list slicing??
OK, let's step through it from scratch using the 33rd tweet again
# Keep going with the 33rd tweet
np.dot(X[33], X.T).toarray().flatten()[:50]
# Notice that there's a score of 17 in the middle'ish...could that be the 33rd position?
np.dot(X[33], X.T).toarray().flatten()[33]
# Aha! Yes. When you dot a row with itself you get
# a score proportional to the length of the tweet (the sum of its squared term counts)
17
inverse_transform, a method on the vectorizer, returns the non-zero entries (terms) the tweet had.....their count also lines up with the score above, since most terms appear just once.
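A quick sketch of that, using the fitted vectorizer and the 33rd row from above:

# Recover the terms present in the 33rd tweet from its count vector
terms = vectorizer.inverse_transform(X[33])[0]
print(len(terms))   # roughly matches the self-similarity score of 17 when each term appears once
print(terms)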
# Back to argsort - note the very last elements
# argsort gives the indices of the values ordered by rank (lowest score first)
# for instance, the highest scores are the very last ones (note how 33 is last, and its value is 17)
np.dot(X[33], X.T).toarray().flatten().argsort()
# Let's do some list slicing fun
# ::-1 reverses the array and :4 provides the first 4 elements of the reversed sort!
# Hopefully that made some sense!
np.dot(X[33], X.T).toarray().flatten().argsort()[::-1][:4]
array([ 33, 51, 423, 88], dtype=int64)
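If the argsort-plus-slicing idiom still feels opaque, here's a tiny standalone example with made-up scores (nothing to do with the real data):

scores = np.array([3, 17, 0, 2, 5])   # hypothetical similarity scores
print(scores.argsort())               # [2 3 0 4 1] - indices ordered from lowest to highest score
print(scores.argsort()[::-1][:3])     # [1 4 0] - indices of the 3 highest scores
print(scores.argsort()[:-4:-1])       # [1 4 0] - same thing via a negative-step slice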
A semi-pythonic way to get the similar tweets ready to be incorporated back into the dataframe
rel_tweets = defaultdict(list)
for k, v in store.items():
    for tweet in v:
        rel_tweets[k].append(df.iloc[tweet]['text'])
# Check out some related tweets
print("\n".join(rel_tweets[0]))  # "named" is in common, "like" is in common......
print("--------------------------------------------------")
# could use a set to find the intersection of terms
print("\n".join(rel_tweets[9]))  # "given" appears twice, "classified information" in 2 tweets, etc...
Congratulations to Thomas Perez, who has just been named Chairman of the DNC. I could not be happier for him, or for the Republican Party!
Just named General H.R. McMaster National Security Advisor.
Nancy Pelosi and Fake Tears Chuck Schumer held a rally at the steps of The Supreme Court and mic did not work (a mess)-just like Dem party!
@goochthegreat Hi there. We'd like to help. Just to confirm, are you still able to sign into your account? Let us know.
--------------------------------------------------
find the leakers within the FBI itself. Classified information is being given to media that could have a devastating effect on U.S. FIND NOW
The real scandal here is that classified information is illegally given out by "intelligence" like candy. Very un-American!
Information is being illegally given to the failing @nytimes & @washingtonpost by the intelligence community (NSA and FBI?).Just like Russia
The FBI is totally unable to stop the national security "leakers" that have permeated our government for a long time. They can't even......
# I do not feel like joining it back in via a Series, however - let's just use the index of the dataframe
# since we know the X matrix is ordered row-wise identically to the dataframe
# Maybe a map function will help us here (use a lambda z)
# You can call .map on the index of any dataframe
df['rel_tweets'] = df.index.map(lambda z: np.dot(X[z], X.T).toarray().flatten().argsort()[::-1][:4])
df.head()
The nice thing about dataframes is that each rel_tweets entry is an actual list (array) of indices: you can perform the same operations on it as you would on a list, without intermediate conversions (no string splitting or other gymnastics).
Now we can look up the tweet text based on the tweet indices!
We'll use a function called "get_rel_tweets" that takes a value from the rel_tweets column, looks up the tweets by their index with a list comprehension, and returns them for handy use down the line.
def get_rel_tweets(row):
    vals = row[1:]  # ignore the first entry, which is the tweet itself
    return [df.iloc[x]['text'] for x in vals]
df['rel_tweet_text'] = df['rel_tweets'].map(get_rel_tweets)
df.head()
print(df.iloc[595]['text'])
print(df.iloc[595]['rel_tweet_text'])
# Here, cuttheknotmath is driving the text similarity
@CutTheKnotMath Voila. Could not find a clean inequality. https://t.co/F0UwcEIEVj
['@CutTheKnotMath Voila. https://t.co/xUNcf6RnaI', 'The smell of mathematical inequality on Sunday evening. @CutTheKnotMath https://t.co/pKbrczGMII', "Interesting discussion around the speculation of link between lifespan hearbeats/breathing and Jensen's inequality.… https://t.co/qqRJpEmbfA"]
Wait, what about the scores? How related are each of the tweets?
Let's go back to our dot product with the similarity scores, say for the 33rd tweet.
# Now we want the values of np.dot(X[33], X.T).toarray().flatten() as a data structure
# but at the above indices....it's pretty easy
# the below kinda looks crazy, but that's because I didn't use any variable names
np.dot(X[33], X.T).toarray().flatten()[np.dot(X[33], X.T).toarray().flatten().argsort()[::-1][:4]]
array([17, 3, 2, 2], dtype=int64)
# So above we have the best score, then the other 3 scores. How can we get this into the dataframe now?
df['scores'] = df.index.map(
    lambda j: np.dot(X[j], X.T).toarray().flatten()[np.dot(X[j], X.T).toarray().flatten().argsort()[::-1][:4]])
df.head(10)
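As a readability note, the one-liner above can be unpacked into a small helper with intermediate variables; this is just a sketch that should produce the same values:

def top_scores(j, k=4):
    # similarity of tweet j to every tweet (shared-term counts, tweet j itself included)
    sims = np.dot(X[j], X.T).toarray().flatten()
    top_idx = sims.argsort()[::-1][:k]   # indices of the k highest scores, best first
    return sims[top_idx]

# df['scores'] = df.index.map(top_scores)   # equivalent to the lambda version above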
# Get the sentiment for each tweet
df['sentiment'] = df['text'].map(lambda x: TextBlob(x).sentiment)
df['polarity'] = df['sentiment'].map(lambda x: x[0])
df['subjectivity'] = df['sentiment'].map(lambda x: x[1])
df.head()
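For reference, TextBlob's sentiment is a namedtuple of (polarity, subjectivity), which is why indexing with x[0] and x[1] above works; a quick sketch (assuming TextBlob was imported earlier in the notebook, otherwise add the import here):

from textblob import TextBlob

s = TextBlob("I could not be happier for him").sentiment
print(s)                   # Sentiment(polarity=..., subjectivity=...); polarity in [-1, 1], subjectivity in [0, 1]
print(s.polarity == s[0])  # attribute access and index access return the same value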