Skip to content

Instantly share code, notes, and snippets.

@swinton
Created October 27, 2010 22:29
Show Gist options
  • Select an option

  • Save swinton/650163 to your computer and use it in GitHub Desktop.

Select an option

Save swinton/650163 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import urllib
import anyjson
import nltk
import random
import twitter_text
import re
def generate_model(cfdist, word, word_count=10):
    """Print a chain of words, each the most frequent successor of the last.

    Starting with *word*, writes up to *word_count* words to standard output,
    each followed by a single space and with no trailing newline.  After a
    word is written, the next word is ``cfdist[word].max()``.

    Args:
        cfdist: Mapping from a word to a frequency distribution of the words
            seen after it.  Each value must be falsy when empty and provide
            ``max()``; ``autoparkin`` passes an ``nltk.ConditionalFreqDist``.
        word: The word the chain starts with.
        word_count: Maximum number of words to write.

    Returns:
        None.  The words are written to standard output only.
    """
    # Local import: sys is not among the imports at the top of the file.
    import sys

    for _ in range(word_count):
        sys.stdout.write("%s " % (word,))
        successors = cfdist[word]
        if not successors:
            # Dead end: nothing was ever observed after this word, so there is
            # no "most frequent successor" to ask for.  Stop instead of
            # calling max() on an empty distribution.
            break
        word = successors.max()
# Verbose-mode tokenizer pattern.  Every group is non-capturing so that
# findall-based tokenizers return whole tokens rather than group tuples, and
# each alternative sits on its own line because "#" starts a comment that runs
# to the end of the line in verbose mode.
_TWEET_TOKEN_PATTERN = r'''(?x)     # set flag to allow verbose regexps
      \$_ent_\d+                    # entity placeholders, always a whole token
    | (?:[A-Z]\.)+                  # abbreviations, e.g. U.S.A.
    | \$?\d+(?:\.\d+)?%?(?!\w)      # currency and percentages, e.g. $12.40, 82%
    | \$?\w+(?:[-']\w+)*            # words with optional internal hyphens
    | \.\.\.                        # ellipsis
    | \:\(                          # sad smiley
    | \:\)                          # happy smiley
    | [!?]                          # these are separate tokens
'''


def _fetch_tweets(screen_name):
    """Download and decode the user timeline requested for *screen_name*."""
    import contextlib

    try:  # Python 3
        from urllib.parse import quote
        from urllib.request import urlopen
    except ImportError:  # Python 2
        from urllib import quote, urlopen

    # NOTE(review): this is Twitter's v1 REST endpoint; it may no longer be
    # served -- confirm before relying on it.
    url = ('http://api.twitter.com/1/statuses/user_timeline.json'
           '?screen_name=%s&count=200&trim_user=true'
           % quote(screen_name, safe=''))
    # closing() guarantees the connection is released even if read() raises.
    with contextlib.closing(urlopen(url)) as response:
        payload = response.read()
    return anyjson.deserialize(payload)


def _tokenize_tweet(tweet_text):
    """Split one tweet into tokens, keeping hashtags, mentions and URLs whole."""
    extractor = twitter_text.Extractor(tweet_text)
    entities = ['#' + tag for tag in extractor.extract_hashtags()]
    entities.extend(
        '@' + name for name in extractor.extract_mentioned_screen_names())
    entities.extend(extractor.extract_urls())

    # Swap every entity for a placeholder the tokenizer treats as one token.
    placeholders = dict(
        ("$_ent_%d" % index, entity) for index, entity in enumerate(entities))
    # Replace the longest entities first so that an entity which is a prefix
    # of another one (e.g. "#foo" and "#foobar") cannot corrupt the longer one.
    longest_first = sorted(
        placeholders.items(), key=lambda item: len(item[1]), reverse=True)
    for placeholder, entity in longest_first:
        tweet_text = tweet_text.replace(entity, placeholder)

    tweet_tokens = nltk.regexp_tokenize(tweet_text, _TWEET_TOKEN_PATTERN)
    # Put the original entity text back in place of each placeholder token.
    return [placeholders.get(token, token) for token in tweet_tokens]


def autoparkin(screen_name="SimonParkin", word_count=10):
    """Print a word chain built from bigram statistics of a user's tweets.

    Fetches the timeline of *screen_name*, tokenizes every tweet, builds a
    conditional frequency distribution over consecutive token pairs, picks a
    random token from the vocabulary and hands it to ``generate_model``.

    Args:
        screen_name: Twitter screen name whose timeline is fetched.
        word_count: Passed through to ``generate_model`` as the maximum number
            of words to print.

    Returns:
        Whatever ``generate_model`` returns.

    Raises:
        ValueError: If the fetched tweets yield no tokens at all.
    """
    tokens = []
    for tweet in _fetch_tweets(screen_name):
        tokens.extend(_tokenize_tweet(tweet['text']))
    if not tokens:
        raise ValueError("no tokens found in the timeline of %r" % (screen_name,))

    # De-dup tokens to derive the vocab; sorting makes the choice below
    # reproducible under random.seed().
    vocab = sorted(set(tokens))
    cfd = nltk.ConditionalFreqDist(nltk.bigrams(tokens))
    return generate_model(cfd, random.choice(vocab), word_count=word_count)
if __name__ == "__main__":
    # autoparkin() writes the generated words to stdout itself and returns
    # generate_model()'s result, which is None; printing that return value
    # only appended a stray "None" to the generated text.
    autoparkin()
    # Terminate the line of space-separated words.
    print("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment