Created
February 16, 2011 14:56
-
-
Save mjbommar/829501 to your computer and use it in GitHub Desktop.
Compare speeds of NLTK and tm.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
@author Michael J Bommarito II
@date Feb 16, 2011
'''
import codecs
import multiprocessing

import dateutil.parser
import nltk
# Single Porter stemmer instance shared by every processDoc() call.
porter = nltk.PorterStemmer()

# Tokens shorter than this many characters are discarded by processDoc().
minLength = 3
def readTweets(fileName):
    '''
    Read in the tweet data. This is just tab-delimited, UTF-8 encoded text.

    Each non-blank line is split on tabs and every field is stripped of
    surrounding whitespace. Blank (or whitespace-only) lines are skipped.

    Returns a list of 4-tuples, one per line:
    (int(field 0), dateutil-parsed field 1, field 2, field 3).

    Raises ValueError if the first field is not an integer and IndexError if
    a line has fewer than four fields; errors raised by
    dateutil.parser.parse for the second field propagate unchanged.
    '''
    tweets = []
    # The context manager guarantees the file is closed even if a row fails
    # to parse part-way through the file.
    with codecs.open(fileName, 'r', 'utf-8') as handle:
        for line in handle:
            # A stray blank line would otherwise crash on int('').
            if not line.strip():
                continue
            row = [field.strip() for field in line.split("\t")]
            tweets.append(
                (int(row[0]), dateutil.parser.parse(row[1]), row[2], row[3])
            )
    return tweets
def processDoc(doc):
    '''
    Pre-process one document and return it as an nltk.Text object.

    The document is tokenized with nltk.word_tokenize. Tokens shorter than
    minLength characters are dropped (the length test is applied to the raw
    token, before stemming); the remaining tokens are lower-cased and stemmed
    with the module-level Porter stemmer.

    This is just a stub so we can call map().
    '''
    tokens = nltk.word_tokenize(doc)
    stems = [
        porter.stem(token.lower())
        for token in tokens
        if len(token) >= minLength
    ]
    return nltk.Text(stems)
def main(fileName="data/tweets_25bahman.csv", processes=8):
    '''
    Main: load the tweets, pre-process them in parallel and print the first
    processed document.

    fileName  -- path of the tab-delimited tweet file passed to readTweets().
    processes -- number of worker processes in the multiprocessing pool.

    Nothing is printed when the input file yields no tweets.
    '''
    pool = multiprocessing.Pool(processes)
    try:
        # The fourth field of each tweet tuple is the text to process.
        docs = [tweet[3] for tweet in readTweets(fileName)]
        corpus = pool.map(processDoc, docs)
    finally:
        # Always shut the workers down, even if reading or mapping fails.
        pool.close()
        pool.join()

    if corpus:
        # Function-call form works as both a Python 2 print statement with a
        # parenthesized expression and the Python 3 print function.
        print(corpus[0])
# Run only when executed as a script, not when the module is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment