Featureset builder for tweets, used for classification and other machine learning tasks.
# contractions.py
# list from: http://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
contractions = [
    ("aren't", "are not"), ("can't", "cannot"), ("can't've", "cannot have"),
    ("'cause", "because"), ("could've", "could have"), ("couldn't", "could not"),
    ("couldn't've", "could not have"), ("didn't", "did not"), ("doesn't", "does not"),
    ("don't", "do not"), ("hadn't", "had not"), ("hadn't've", "had not have"),
    ("hasn't", "has not"), ("haven't", "have not"), ("he'd", "he had"),
    ("he'd've", "he would have"), ("he'll", "he will"), ("he'll've", "he will have"),
    ("he's", "he is"), ("how'd", "how did"), ("how'd'y", "how did you"),
    ("how'll", "how will"), ("how's", "how is"), ("i'd", "i had"),
    ("i'd've", "i would have"), ("i'll", "i will"), ("i'll've", "i will have"),
    ("i'm", "i am"), ("i've", "i have"), ("isn't", "is not"),
    ("it'd", "it had"), ("it'd've", "it would have"), ("it'll", "it will"),
    ("it'll've", "it will have"), ("it's", "it is"), ("let's", "let us"),
    ("ma'am", "madam"), ("might've", "might have"), ("mightn't", "might not"),
    ("mightn't've", "might not have"), ("must've", "must have"), ("mustn't", "must not"),
    ("mustn't've", "must not have"), ("needn't", "need not"), ("o'clock", "of the clock"),
    ("oughtn't", "ought not"), ("oughtn't've", "ought not have"), ("shan't", "shall not"),
    ("shan't've", "shall not have"), ("she'd", "she had"), ("she'd've", "she would have"),
    ("she'll", "she will"), ("she'll've", "she will have"), ("she's", "she is"),
    ("should've", "should have"), ("shouldn't", "should not"),
    ("shouldn't've", "should not have"), ("so's", "so is"), ("that's", "that is"),
    ("there'd", "there would"), ("there's", "there is"), ("they'd", "they would"),
    ("they'll", "they will"), ("they'll've", "they will have"), ("they're", "they are"),
    ("they've", "they have"), ("to've", "to have"), ("wasn't", "was not"),
    ("we'd", "we would"), ("we'll", "we will"), ("we'll've", "we will have"),
    ("we're", "we are"), ("we've", "we have"), ("weren't", "were not"),
    ("what'll", "what will"), ("what'll've", "what will have"), ("what're", "what are"),
    ("what's", "what is"), ("what've", "what have"), ("when's", "when is"),
    ("when've", "when have"), ("where'd", "where did"), ("where's", "where is"),
    ("where've", "where have"), ("who'll", "who will"), ("who'll've", "who will have"),
    ("who's", "who is"), ("who've", "who have"), ("why's", "why is"),
    ("will've", "will have"), ("won't", "will not"), ("won't've", "will not have"),
    ("would've", "would have"), ("wouldn't", "would not"),
    ("wouldn't've", "would not have"), ("y'all", "you all"),
    ("y'all'd've", "you all would have"), ("y'all're", "you all are"),
    ("y'all've", "you all have"), ("you'd", "you would"), ("you'd've", "you would have"),
    ("you'll", "you will"), ("you'll've", "you will have"), ("you're", "you are"),
    ("you've", "you have")
]
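For a quick sanity check, the pair list converts directly to a dict, which gives constant-time lookup. A minimal sketch (the expand_map name is illustrative, not part of the gist):

# example (hypothetical): O(1) contraction lookup
from contractions import contractions

expand_map = dict(contractions)
print(expand_map["won't"])               # -> will not
print(expand_map.get("hello", "hello")) # unknown tokens fall through unchanged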
# stopwords.py
# because the NLTK stopwords corpus is not enough
# from http://www.ranks.nl/resources/stopwords.html
stopwords = [
    "a", "able", "about", "above", "abst", "accordance", "according", "accordingly",
    "across", "act", "actually", "added", "adj", "affected", "affecting", "affects",
    "after", "afterwards", "again", "against", "ah", "all", "almost", "alone",
    "along", "already", "also", "although", "always", "am", "among", "amongst",
    "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore",
    "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are",
    "aren", "arent", "arise", "around", "as", "aside", "ask", "asking",
    "at", "auth", "available", "away", "awfully", "b", "back", "be",
    "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
    "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below",
    "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly",
    "but", "by", "c", "ca", "came", "can", "cannot", "can't",
    "cause", "causes", "certain", "certainly", "co", "com", "come", "comes",
    "contain", "containing", "contains", "could", "couldnt", "d", "date", "did",
    "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't",
    "down", "downwards", "due", "during", "e", "each", "ed", "edu",
    "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end",
    "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever",
    "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f",
    "far", "few", "ff", "fifth", "first", "five", "fix", "followed",
    "following", "follows", "for", "former", "formerly", "forth", "found", "four",
    "from", "further", "furthermore", "g", "gave", "get", "gets", "getting",
    "give", "given", "gives", "giving", "go", "goes", "gone", "got",
    "gotten", "h", "had", "happens", "hardly", "has", "hasn't", "have",
    "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter",
    "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi",
    "hid", "him", "himself", "his", "hither", "home", "how", "howbeit",
    "however", "hundred", "i", "id", "ie", "if", "i'll", "im",
    "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index",
    "information", "instead", "into", "invention", "inward", "is", "isn't", "it",
    "itd", "it'll", "its", "itself", "i've", "j", "just", "k",
    "keep", "keeps", "kept", "kg", "km", "know", "known", "knows",
    "l", "largely", "last", "lately", "later", "latter", "latterly", "least",
    "less", "lest", "let", "lets", "like", "liked", "likely", "line",
    "little", "'ll", "look", "looking", "looks", "ltd", "m", "made",
    "mainly", "make", "makes", "many", "may", "maybe", "me", "mean",
    "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss",
    "ml", "more", "moreover", "most", "mostly", "mr", "mrs", "much",
    "mug", "must", "my", "myself", "n", "na", "name", "namely",
    "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs",
    "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no",
    "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos",
    "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained",
    "obviously", "of", "off", "often", "oh", "ok", "okay", "old",
    "omitted", "on", "once", "one", "ones", "only", "onto", "or",
    "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves",
    "out", "outside", "over", "overall", "owing", "own", "p", "page",
    "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed",
    "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly",
    "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put",
    "q", "que", "quickly", "quite", "qv", "r", "ran", "rather",
    "rd", "re", "readily", "really", "recent", "recently", "ref", "refs",
    "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted",
    "resulting", "results", "right", "run", "s", "said", "same", "saw",
    "say", "saying", "says", "sec", "section", "see", "seeing", "seem",
    "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven",
    "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't",
    "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar",
    "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow",
    "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon",
    "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly",
    "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure",
    "t", "take", "taken", "taking", "tell", "tends", "th", "than",
    "thank", "thanks", "thanx", "that", "that'll", "thats", "that've", "the",
    "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter",
    "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres",
    "thereto", "thereupon", "there've", "these", "they", "theyd", "they'll", "theyre",
    "they've", "think", "this", "those", "thou", "though", "thoughh", "thousand",
    "throug", "through", "throughout", "thru", "thus", "til", "tip", "to",
    "together", "too", "took", "toward", "towards", "tried", "tries", "truly",
    "try", "trying", "ts", "twice", "two", "u", "un", "under",
    "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon",
    "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses",
    "using", "usually", "v", "value", "various", "'ve", "very", "via",
    "viz", "vol", "vols", "vs", "w", "want", "wants", "was",
    "wasn't", "way", "we", "wed", "welcome", "we'll", "went", "were",
    "weren't", "we've", "what", "whatever", "what'll", "whats", "when", "whence",
    "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon",
    "wherever", "whether", "which", "while", "whim", "whither", "who", "whod",
    "whoever", "whole", "who'll", "whom", "whomever", "whos", "whose", "why",
    "widely", "willing", "wish", "with", "within", "without", "won't", "words",
    "world", "would", "wouldn't", "www", "x", "y", "yes", "yet",
    "you", "youd", "you'll", "your", "youre", "yours", "yourself", "yourselves",
    "you've", "z", "zero"
]
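Membership tests against a ~670-entry list are O(n) per token; if filtering speed matters, wrapping the list in a set is the usual fix. A minimal sketch (the stopword_set name is illustrative, not defined in the gist):

# example (hypothetical): constant-time stopword filtering
from stopwords import stopwords

stopword_set = set(stopwords)
tokens = ["the", "senate", "votes", "on", "thursday"]
print([t for t in tokens if t not in stopword_set])  # -> ['senate', 'votes', 'thursday']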
# tf_idf.py
# term frequency - inverse document frequency
# word ranker used to build a featureset for classifiers
# for convenience, the functions below assume the following:
#   a "document" is a list of tokens
#   a "corpus" is a list of documents (a list of lists of tokens)

#makes / return true (floating-point) division instead of floor division
from __future__ import division

import math

class IdfDict(dict):
    """
    dictionary of idf values
    """
    def __init__(self, corpus_size, *args, **kwargs):
        super(IdfDict, self).__init__(*args, **kwargs)
        self.corpus_size = corpus_size

    def __getitem__(self, key):
        #if a term is not in the dictionary, calculate an idf value anyway
        if key not in self:
            return math.log(self.corpus_size)
        else:
            return super(IdfDict, self).__getitem__(key)

def tf_raw(term, document):
    return document.count(term)

def tf_bool(term, document):
    return 1 if term in document else 0

def tf_log(term, document):
    #guard against math.log(0) for terms that do not occur in the document
    count = document.count(term)
    return math.log(count) if count > 0 else 0.0

def tf(term, document, algorithm="RAW"):
    """
    Calculates the frequency of a term within a document
    Can specify different algorithms:
    -RAW calculates the raw frequency of a term
        -i.e., the number of times it occurs in the document
    -BOOL calculates the "boolean frequency" of a term
        -i.e., if the term is in the document, tf is 1; if not, tf is 0
    -LOG calculates a logarithmically scaled term frequency
        -i.e., tf = log of the raw frequency
    -AUG calculates term frequency divided by the max frequency of any word
     in the document
        -i.e., this prevents bias towards longer documents
    NOTE: only RAW, BOOL, and LOG are implemented at the moment
    """
    if algorithm == "RAW":
        return tf_raw(term, document)
    elif algorithm == "BOOL":
        return tf_bool(term, document)
    elif algorithm == "LOG":
        return tf_log(term, document)
    else:
        raise ValueError("tf cannot use algorithm %s" % algorithm)

def idf(term, corpus):
    """
    Calculates the inverse document frequency for a term
    idf is the log of the ratio between the total number of documents
    in the corpus and the number of docs in the corpus with the given term
    """
    corpus_size = len(corpus)
    docs_with_term = 0
    for document in corpus:
        if term in document:
            docs_with_term += 1
    #add 1 to docs_with_term to account for terms that don't occur in the corpus,
    #so that a division by zero doesn't occur
    return math.log(corpus_size / (docs_with_term + 1))

def tf_idf(term, document, corpus, algorithm="RAW"):
    """
    return tf-idf score
    """
    #if the idf score was calculated previously, don't calculate it again;
    #in that case the caller passes the precomputed idf value (a float)
    #in place of the corpus; this occurs when the tf-idf of a term
    #is being calculated for all documents in a corpus
    if type(corpus) is float:
        return tf(term, document, algorithm) * corpus
    else:
        return tf(term, document, algorithm) * idf(term, corpus)

def idf_corpus(corpus):
    """
    calculates idf scores for all terms in a corpus
    """
    #first, build a vocab of the corpus
    vocab = set()
    for document in corpus:
        vocab |= set(document)
    #then, calculate the idf for each term in the vocab
    idf_set = IdfDict(len(corpus))
    for term in vocab:
        idf_set[term] = idf(term, corpus)
    return idf_set

def tf_idf_corpus(corpus, algorithm="RAW", idf_set=None):
    """
    calculates tf-idf scores for all terms in every document of a corpus
    """
    #retrieve idf scores for all words in the corpus
    if idf_set is None:
        idf_set = idf_corpus(corpus)
    #calculate the tf-idf score for every document
    doc_set = []
    for document in corpus:
        doc_vocab = set(document)
        tf_idf_set = {}
        #calculate tf and then the tf-idf score for every term
        for term in doc_vocab:
            tf_score = tf(term, document, algorithm)
            tf_idf_set[term] = tf_score * idf_set[term]
        doc_set.append(tf_idf_set)
    return doc_set
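A toy run shows the shape of the output: tf_idf_corpus returns one term-to-score dict per document, and IdfDict falls back to log(corpus_size) for unseen terms. A minimal sketch using the functions above (the sample corpus is made up):

# example (hypothetical): tf-idf over a toy three-document corpus
from tf_idf import idf_corpus, tf_idf_corpus

corpus = [
    ["vote", "senate", "bill"],
    ["vote", "today"],
    ["cat", "video"],
]
idf_set = idf_corpus(corpus)
print(idf_set["vote"])  # log(3 / (2+1)) = 0.0 ("vote" is in 2 of 3 docs, +1 smoothing)
print(idf_set["cat"])   # log(3 / (1+1)) ~ 0.405
print(idf_set["dog"])   # ~1.099: unseen terms fall back to log(corpus_size)
scores = tf_idf_corpus(corpus, algorithm="BOOL", idf_set=idf_set)
print(scores[2])        # {'cat': 0.405..., 'video': 0.405...}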
# tweet_featureset.py
# build a featureset from a tweet corpus
# a tweet corpus is assumed to be a list of tweets,
# where a tweet is a dictionary with at least the following keys:
#   -id
#   -text
#   -political (Boolean; answers whether a tweet is political or not)

from nltk.tokenize import WhitespaceTokenizer

from tweet_preprocess import cleanup_text, cleanup_tokens
from tf_idf import idf_corpus, tf_idf_corpus

class TweetFeatureset(object):
    """
    TweetFeatureset class
    creates featuresets for tweets
    """
    tokenizer = WhitespaceTokenizer()

    def __init__(self, corpus):
        self.train(corpus)

    @classmethod
    def tokenize_tweet(cls, tweet):
        """
        tokenize a single tweet
        """
        #clean up the tweet text
        tweet["text"] = cleanup_text(tweet["text"])
        #tokenize the tweet with the NLTK whitespace tokenizer
        #to preserve contractions, which are expanded during token processing
        tweet["tokens"] = cls.tokenizer.tokenize(tweet["text"])
        #clean up the tokenized tweet
        tweet["tokens"] = cleanup_tokens(tweet["tokens"])
        return tweet

    @classmethod
    def tokenize_corpus(cls, corpus):
        """
        return a tweet corpus in tokenized form
        """
        corpus = [cls.tokenize_tweet(tweet) for tweet in corpus]
        #remove empty tweets from the corpus
        return [tweet for tweet in corpus if len(tweet["tokens"]) > 0]

    def train(self, corpus):
        """
        use corpus to calculate idf scores
        """
        corpus = TweetFeatureset.tokenize_corpus(corpus)
        token_corpus = [tweet["tokens"] for tweet in corpus]
        self.idf_set = idf_corpus(token_corpus)

    def build_tagged_featureset(self, tweets, algorithm="BOOL"):
        """
        build a featureset for a classifier using a tweet corpus
        and pair it with a tag (political/apolitical)
        use the boolean frequency algorithm because tweets are so short:
        there is no reason to count words in each tweet,
        just detect whether a word is in the tweet
        """
        #pair features with their respective tags
        #NOTE: this assumes build_featureset returns one featureset per
        #input tweet; tokenize_corpus drops empty tweets, which would
        #misalign features and tags
        tagged_features = [
            (features, tweets[i]["political"])
            for i, features in enumerate(self.build_featureset(tweets, algorithm))
        ]
        return tagged_features

    def build_featureset(self, tweets, algorithm="BOOL"):
        """
        build a featureset with no pairing to a tag
        """
        #tokenize the corpus
        corpus = TweetFeatureset.tokenize_corpus(tweets)
        #extract features from the tokenized tweet corpus
        token_corpus = [tweet["tokens"] for tweet in corpus]
        feature_corpus = tf_idf_corpus(token_corpus, algorithm, self.idf_set)
        return feature_corpus
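End to end, training a classifier on the tagged featureset looks roughly like this; the two sample tweets and the choice of NLTK's NaiveBayesClassifier are illustrative, not part of the gist:

# example (hypothetical): training a classifier on tagged featuresets
from nltk.classify import NaiveBayesClassifier
from tweet_featureset import TweetFeatureset

tweets = [
    {"id": 1, "text": "Vote NO on the new senate bill! #politics", "political": True},
    {"id": 2, "text": "just had the best burrito of my life", "political": False},
]
fs = TweetFeatureset(tweets)                 # computes idf scores from the corpus
tagged = fs.build_tagged_featureset(tweets)  # [(features, political), ...]
classifier = NaiveBayesClassifier.train(tagged)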
# tweet_preprocess.py
# preprocess tweets before using them
# to build a featureset for a classifier

import re

from nltk.tag import pos_tag

from stopwords import stopwords
from contractions import contractions

#utility functions for cleaning up,
#i.e., preprocessing tweet text before tokenization

def normalize_whitespace(func):
    """
    strip whitespace from the start and end of a tweet
    also, collapse runs of whitespace chars into a single space
    """
    return lambda text: re.sub(r"\s{2,}", " ", func(text).strip())

def remove_punctuation(func):
    """
    remove punctuation from text
    """
    #TO DO
    #preserve apostrophes between letters for contractions
    #strip possessives
    #the hyphen sits at the end of the character class so it is
    #matched literally rather than forming a range
    return lambda text: re.sub(r"[`~!@#$%^&*()=_+,./<>?;':\"\[\]{}\|-]",
        " ", func(text))

def remove_possessives(func):
    """
    remove possessives from a noun
    """
    return lambda text: re.sub(r"[^\s]'s(\s+|$)",
        lambda match: (match.group(0).strip()[:-2] + " "), func(text))

def convert_to_lowercase(func):
    """
    convert tweet text to all lowercase
    """
    return lambda text: func(text).lower()

def convert_hashtags(func):
    """
    convert hashtags into individual words
    ex. #TeamJacob would be converted to team jacob
    """
    return lambda text: \
        re.sub(r"#([^\s#]+)",
            lambda match: \
                " "
                + re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", match.group(1))
                + " ",
            func(text))

def remove_retweets(func):
    """
    remove the retweet tag ("RT") from a tweet
    """
    return lambda text: re.sub(r"(\s+|^)RT(\s+|$)", " ", func(text))

def remove_usernames(func):
    """
    remove @usernames from a tweet
    """
    return lambda text: re.sub(r"(\s+|^)@[^\s]+", " ", func(text))

def remove_email(func):
    """
    remove email addresses from a tweet
    """
    return lambda text: re.sub(r"\s*[^@\s]+@[^@\s]+\.[^@\s]+", " ", func(text))

def remove_links(func):
    """
    remove hyperlinks from a tweet
    """
    return lambda text: re.sub(r"\s*(https|http|ftp)[^\s]+", " ", func(text))

def convert_to_ascii(func):
    """
    convert unicode to ascii by removing all non-ascii characters
    """
    #decode back to text so the rest of the pipeline gets a string,
    #not bytes (needed under Python 3)
    return lambda text: func(text).encode("ascii", "ignore").decode("ascii")

#the decorators are applied bottom-up:
#convert_to_ascii runs first, normalize_whitespace runs last
@normalize_whitespace
@remove_punctuation
@remove_possessives
@convert_to_lowercase
@convert_hashtags
@remove_retweets
@remove_usernames
@remove_email
@remove_links
@convert_to_ascii
def cleanup_text(text):
    return text

#preprocess tokens

def remove_irrelevant_pos_tokens(tokens):
    """
    remove tokens whose part of speech is probably irrelevant
    to the political content of the tweet
    assume that only nouns, verbs, and adjectives are relevant;
    remove all other tokens
    """
    pos_tokens = pos_tag(tokens)
    #filter on the POS tag (N* = noun, V* = verb, J* = adjective)
    return [token for token, tag in pos_tokens
        if tag.startswith("N")
        or tag.startswith("V")
        or tag.startswith("J")
    ]

def remove_short_tokens(tokens):
    """
    remove tokens that are 2 or fewer characters;
    we can assume that these tokens aren't important
    """
    return [token for token in tokens if len(token) > 2]

def remove_stopwords(tokens):
    """
    remove stopwords (i.e., "scaffold words" in English w/o much meaning)
    """
    return [token for token in tokens if token not in stopwords]

def expand_contraction_tokens(tokens):
    """
    expand contractions in a list of tokens
    """
    result_tokens = []
    for token in tokens:
        is_contraction = False
        for contraction, expansion in contractions:
            #match the contraction with or without its apostrophes
            if token == contraction or token == contraction.replace("'", ""):
                result_tokens += expansion.split()
                is_contraction = True
                break
        if not is_contraction:
            result_tokens.append(token)
    return result_tokens

def cleanup_tokens(tokens):
    """
    preprocess tokens before using them to build a featureset
    """
    tokens = expand_contraction_tokens(tokens)
    tokens = remove_stopwords(tokens)
    tokens = remove_short_tokens(tokens)
    return tokens
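A sample run through the whole pipeline; the raw tweet is made up, and the expected output (traced from the decorator chain above) is shown in the comments:

# example (hypothetical): cleaning a raw tweet end to end
from tweet_preprocess import cleanup_text, cleanup_tokens

raw = "RT @user: Obama's speech was great! http://t.co/x #BigNews"
text = cleanup_text(raw)
print(text)                          # -> obama speech was great big news
print(cleanup_tokens(text.split()))  # -> ['obama', 'speech', 'great', 'big', 'news']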