-
-
Save vgoklani/ef457dba0c55677f32ceec0646c5ccfe to your computer and use it in GitHub Desktop.
Python Utilities for Tweets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string
from datetime import datetime

from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
def get_time(tweet):
    """Return the tweet's creation time as a naive ``datetime``.

    Parses ``tweet['created_at']`` with the fixed format
    ``"%a %b %d %H:%M:%S +0000 %Y"``. The ``+0000`` offset is matched as
    literal text, so the result carries no ``tzinfo``.

    Raises:
        KeyError: if ``tweet`` has no ``'created_at'`` key.
        ValueError: if the timestamp does not match the expected format.
    """
    return datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y")
def get_hashtags(tweet):
    """Return the ``'text'`` of every hashtag entity in ``tweet``.

    Reads ``tweet['entities']['hashtags']``. A missing or empty
    ``'entities'`` mapping or ``'hashtags'`` list yields ``[]`` instead of
    raising, matching how ``get_text_cleaned`` treats absent entity lists.
    """
    entities = tweet.get('entities') or {}
    return [tag['text'] for tag in entities.get('hashtags') or ()]
def get_user_mentions(tweet):
    """Return the ``'screen_name'`` of every user-mention entity in ``tweet``.

    Reads ``tweet['entities']['user_mentions']``. A missing or empty
    ``'entities'`` mapping or ``'user_mentions'`` list yields ``[]`` instead
    of raising, matching how ``get_text_cleaned`` treats absent entity lists.
    """
    entities = tweet.get('entities') or {}
    return [m['screen_name'] for m in entities.get('user_mentions') or ()]
def get_text_cleaned(tweet):
    """Return ``tweet['text']`` with all entity spans cut out.

    Every entry under ``tweet['entities']`` for the kinds ``urls``,
    ``hashtags``, ``user_mentions``, ``media`` and ``symbols`` contributes the
    half-open character range ``indices[0]:indices[1]``; those ranges are
    removed from the text. Surrounding whitespace is left untouched.

    Entity kinds that are absent are skipped. Ranges that overlap or repeat
    (e.g. the same indices listed under two kinds) are merged, so each
    character is removed at most once.

    Raises:
        KeyError: if ``tweet`` has no ``'text'`` key or an entity has no
            ``'indices'`` key.
    """
    text = tweet['text']
    entities = tweet.get('entities') or {}

    # Collect the (start, stop) range of every entity that must be removed.
    spans = []
    for kind in ('urls', 'hashtags', 'user_mentions', 'media', 'symbols'):
        for entity in entities.get(kind) or ():
            indices = entity['indices']
            spans.append((indices[0], indices[1]))

    # Walk the ranges left to right, keeping only the text between them.
    # Tracking the furthest stop seen so far makes overlapping or duplicate
    # ranges harmless: text before the cursor has already been handled.
    spans.sort()
    kept = []
    cursor = 0
    for start, stop in spans:
        if start > cursor:
            kept.append(text[cursor:start])
        cursor = max(cursor, stop)
    kept.append(text[cursor:])
    return ''.join(kept)
def get_text_sanitized(tweet):
    """Return the cleaned tweet text as lower-case, punctuation-trimmed words.

    The text from ``get_text_cleaned(tweet)`` is split on whitespace; each
    word is lower-cased and has leading and trailing ``string.punctuation``
    characters removed. Words that become empty (i.e. consisted only of
    punctuation) are dropped, and the rest are joined with single spaces.
    Punctuation inside a word is preserved.
    """
    # split() already discards surrounding whitespace, so one strip() of the
    # punctuation set covers both ends of each word.
    words = (
        w.lower().strip(string.punctuation)
        for w in get_text_cleaned(tweet).split()
    )
    return ' '.join(w for w in words if w)
def get_text_normalized(tweet):
    """Return the tweet's words sanitized, stop-word filtered and stemmed.

    The text from ``get_text_sanitized(tweet)`` is split into words, words
    found in NLTK's English stop-word list are removed, and each remaining
    word is stemmed with ``LancasterStemmer``. The result is a list of stems
    in their original order.
    """
    # Sanitize the text first.
    tokens = get_text_sanitized(tweet).split()
    # Load the stop-word list once, as a set, rather than re-reading it and
    # scanning it linearly for every token.
    stop_words = set(stopwords.words('english'))
    stemmer = LancasterStemmer()
    # Drop the stop words, then stem what is left.
    return [stemmer.stem(token) for token in tokens if token not in stop_words]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment