Supervised Classification of Tweets
import nltk
import random
import re
STATIONS = [
    'Admiralty MRT',
    'Aljunied MRT',
    'Ang Mo Kio MRT',
    'Bartley MRT',
    'Bayfront MRT',
    'Bedok MRT',
    'Bishan MRT',
    'Bras Basah MRT',
    'Botanic Gardens MRT',
    'Braddell MRT',
    'Bukit Batok MRT',
    'Bukit Gombak MRT',
    'Caldecott MRT',
    'Choa Chu Kang MRT',
    'Boon Keng MRT',
    'Boon Lay MRT',
    'Buangkok MRT',
    'Bugis MRT',
    'Buona Vista MRT',
    'Changi Airport MRT',
    'Chinatown MRT',
    'Clarke Quay MRT',
    'Chinese Garden MRT',
    'City Hall MRT',
    'Clementi MRT',
    'Commonwealth MRT',
    'Dakota MRT',
    'Dhoby Ghaut MRT',
    'Dover MRT',
    'Esplanade MRT',
    'Eunos MRT',
    'Expo MRT',
    'Farrer Park MRT',
    'Farrer Road MRT',
    'HarbourFront MRT',
    'Haw Par Villa MRT',
    'Holland Village MRT',
    'Hougang MRT',
    'Joo Koon MRT',
    'Jurong East MRT',
    'Kallang MRT',
    'Kovan MRT',
    'Kembangan MRT',
    'Kent Ridge MRT',
    'Khatib MRT',
    'Kranji MRT',
    'Lakeside MRT',
    'Labrador Park MRT',
    'Lavender MRT',
    'Little India MRT',
    'Lorong Chuan MRT',
    'Marina Bay MRT',
    'Marsiling MRT',
    'MacPherson MRT',
    'Marymount MRT',
    'Mountbatten MRT',
    'Newton MRT',
    'Nicoll Highway MRT',
    'one-north MRT',
    'Novena MRT',
    'Orchard MRT',
    'Outram Park MRT',
    'Pasir Ris MRT',
    'Pasir Panjang MRT',
    'Paya Lebar MRT',
    'Pioneer MRT',
    'Potong Pasir MRT',
    'Promenade MRT',
    'Punggol MRT',
    'Queenstown MRT',
    'Raffles Place MRT',
    'Redhill MRT',
    'Sembawang MRT',
    'Sengkang MRT',
    'Serangoon MRT',
    'Simei MRT',
    'Somerset MRT',
    'Stadium MRT',
    'Tampines MRT',
    'Tai Seng MRT',
    'Tanah Merah MRT',
    'Tanjong Pagar MRT',
    'Tiong Bahru MRT',
    'Telok Blangah MRT',
    'Toa Payoh MRT',
    'Woodlands MRT',
    'Woodleigh MRT',
    'Yew Tee MRT',
    'Yio Chu Kang MRT',
    'Yishun MRT'
]
# regular expressions used to clean up the tweet data
mrt_station_re = re.compile('|'.join(STATIONS).lower())
http_re = re.compile(r'\s+http://[^\s]*')
remove_ellipsis_re = re.compile(r'\.\.\.')
at_sign_re = re.compile(r'\@\S+')
punct_re = re.compile(r"[\"'\[\],.:;()\-&!]")
price_re = re.compile(r"\d+\.\d\d")
number_re = re.compile(r"\d+")
# converts to lower case and cleans up the text
def normalize_tweet(tweet):
    t = tweet.lower()
    t = re.sub(price_re, 'PRICE', t)
    t = re.sub(remove_ellipsis_re, '', t)
    t = re.sub(mrt_station_re, 'MRT_STATION', t)
    t = re.sub(http_re, ' LINK', t)
    t = re.sub(punct_re, '', t)
    t = re.sub(at_sign_re, '@', t)
    t = re.sub(number_re, 'NUM', t)
    return t
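# A quick sanity check on a hypothetical tweet (not from the data set):
#   normalize_tweet('Stuck at Bishan MRT, delayed 10 mins... http://t.co/abc')
#   returns 'stuck at MRT_STATION delayed NUM mins LINK'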
def tweet_features(tweet_data):
    features = {}
    tweet = normalize_tweet(tweet_data['tweet'])
    for bigram in nltk.bigrams(tweet.split(' ')):
        features['contains(%s)' % ','.join(bigram)] = True
    return features
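# On the hypothetical tweet above, this yields bigram features such as
#   {'contains(stuck,at)': True, 'contains(at,MRT_STATION)': True, ...}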
# reads one three-line record from a file: tweet, label, blank separator
read3lines = lambda x: [ x.readline().strip(), x.readline().strip(), x.readline() ]
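# labelled_tweets.data is assumed (from the reading loop below) to hold
# three-line records, e.g.
#   Stuck at Bishan MRT, delayed 10 mins... http://t.co/abc
#   delay
#   <blank line>
# where 'delay' is a hypothetical label; the actual label set is whatever
# the file defines.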
data = []
with open('labelled_tweets.data') as f:
    tweet, label, newline = read3lines(f)
    while len(tweet) > 0:
        data.append({ 'tweet': tweet, 'label': label })
        tweet, label, newline = read3lines(f)
#random.shuffle(data)
# we split the data into two parts:
# the first part (90% of the data) is for training,
# the remaining 10% of the data is for testing
size = int(len(data) * 0.9)
train_data = data[:size]
test_data = data[size:]
# generate features for each tweet
train_set = [ (tweet_features(d), d['label']) for d in train_data ]
test_set = [ (tweet_features(d), d['label']) for d in test_data ]
# pick a classifier and train it on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(20)
# collect tweets that were wrongly classified
errors = []
for d in test_data:
    label = d['label']
    guess = classifier.classify(tweet_features(d))
    if guess != label:
        errors.append( (label, guess, d) )
# sort on (label, guess) only; the dicts in the third slot are not orderable
for (label, guess, d) in sorted(errors, key=lambda e: (e[0], e[1])):
    print('correct label: %s\nguessed label: %s\ntweet=%s\n' % (label, guess, d['tweet']))
print('Total errors: %d' % len(errors))
print('Accuracy:', nltk.classify.accuracy(classifier, test_set))
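# A minimal sketch of classifying a fresh, unlabelled tweet with the trained
# model (hypothetical example; predictions use the labels seen in training):
new_tweet = {'tweet': 'Train stalled at Jurong East MRT again...'}
print('Predicted label:', classifier.classify(tweet_features(new_tweet)))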