Created
October 27, 2010 22:29
-
-
Save swinton/650163 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import urllib | |
| import anyjson | |
| import nltk | |
| import random | |
| import twitter_text | |
| import re | |
def generate_model(cfdist, word, word_count=10):
    """Print a chain of words generated greedily from a bigram model.

    Starting at ``word``, each step emits the current word and then moves on
    to its most frequent successor, ``cfdist[word].max()``.

    Args:
        cfdist: Mapping from a word to a frequency distribution of the words
            seen directly after it. Each distribution must be falsy when
            empty and provide ``max()``.
        word: Seed word (a string); it is always the first word emitted.
        word_count: Maximum number of words to emit.

    Returns:
        None. The words are written to stdout as one space-separated line;
        nothing is written when ``word_count`` is 0.
    """
    words = []
    for _ in range(word_count):
        words.append(word)
        successors = cfdist[word]
        if not successors:
            # Dead end: nothing was ever observed after this word, so there
            # is no "most frequent successor" to ask for. Stop here rather
            # than calling max() on an empty distribution.
            break
        word = successors.max()

    if words:
        # A single parenthesised argument prints the same way under the
        # Python 2 print statement and the Python 3 print function.
        print(" ".join(words))
def autoparkin(screen_name="SimonParkin", word_count=10):
    """Generate text in the style of a Twitter user's recent tweets.

    Downloads up to 200 tweets from the user's timeline, tokenizes them
    (hashtags, @mentions and URLs are kept as single tokens), builds a
    bigram conditional frequency distribution and hands a randomly chosen
    seed word to ``generate_model``.

    Args:
        screen_name: Twitter screen name whose timeline is fetched.
        word_count: Maximum number of words to generate.

    Returns:
        Whatever ``generate_model`` returns.

    Raises:
        ValueError: If the timeline yields no tokens at all.
    """
    # NOTE(review): this is the unauthenticated v1 REST endpoint, which is
    # believed to be retired -- confirm before relying on it.
    url = ('http://api.twitter.com/1/statuses/user_timeline.json'
           '?screen_name=%s&count=200&trim_user=true' % screen_name)
    response = urllib.urlopen(url)
    try:
        payload = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()
    tweets = anyjson.deserialize(payload)

    tokens = []
    for tweet in tweets:
        tokens.extend(_tokenize_tweet(tweet['text']))

    if not tokens:
        # Previously surfaced as an opaque "empty range for randrange()".
        raise ValueError("no tokens found in the timeline of %r" % screen_name)

    # De-dup tokens so every distinct word is equally likely to be the seed.
    vocab = list(set(tokens))
    cfd = nltk.ConditionalFreqDist(nltk.bigrams(tokens))
    return generate_model(cfd, random.choice(vocab), word_count=word_count)


# Verbose tokenizer pattern. Groups are non-capturing so that a
# findall-based tokenizer returns whole matches rather than group contents,
# and every alternative sits on its own line so that a trailing '#' comment
# cannot swallow the alternative that follows it.
_TOKEN_PATTERN = r'''(?x)       # set flag to allow verbose regexps
      (?:[A-Z]\.)+              # abbreviations, e.g. U.S.A.
    | \$?\w+(?:[-']\w+)*        # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                    # ellipsis
    | \:\(                      # :(
    | \:\)                      # :)
    | [!?]                      # these are separate tokens
'''


def _tokenize_tweet(tweet_text):
    """Tokenize one tweet, keeping hashtags, mentions and URLs intact."""
    extractor = twitter_text.Extractor(tweet_text)
    entities = ['#' + hashtag for hashtag in extractor.extract_hashtags()]
    entities.extend(
        '@' + name for name in extractor.extract_mentioned_screen_names())
    entities.extend(extractor.extract_urls())

    # Swap each entity for a "$_ent_N" placeholder that the word rule of the
    # pattern matches as a single token, then swap the entities back in.
    entity_map = [("$_ent_%d" % num, entity)
                  for num, entity in enumerate(entities)]
    # Replace the longest entities first so that a short one ('@foo') cannot
    # clobber the beginning of a longer one ('@foobar').
    by_length = sorted(entity_map, key=lambda pair: len(pair[1]), reverse=True)
    for placeholder, entity in by_length:
        tweet_text = tweet_text.replace(entity, placeholder)

    lookup = dict(entity_map)
    return [lookup.get(token, token)
            for token in nltk.regexp_tokenize(tweet_text, _TOKEN_PATTERN)]
if __name__ == "__main__":
    # autoparkin() writes the generated text to stdout itself and returns
    # None, so printing its return value would only append a stray "None".
    autoparkin()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment