swinton · October 27, 2010 22:29
diff --git a/autoparkin.py b/autoparkin.py
 #!/usr/bin/env python

 import urllib
 import anyjson
 import nltk
 import random
 import twitter_text
 import re

 def generate_model(cfdist, word, word_count=10):
    """Print a chain of at most ``word_count`` words, starting at ``word``.

    Every step writes the current word followed by a single space to
    standard output and then moves on to the most frequent successor that
    ``cfdist`` records for that word.

    Args:
        cfdist: mapping from a word to a frequency distribution of the words
            seen after it. Each distribution must provide ``max()`` and be
            falsy when empty (as dict-based distributions are).
        word: seed word the chain starts from.
        word_count: upper bound on the number of words written.

    Returns:
        None. The words are only written to standard output.
    """
    # Local import: nothing else in the file needs ``sys``.
    import sys

    for _ in range(word_count):
        # sys.stdout.write is valid on Python 2 and 3 alike, unlike the
        # ``print word,`` statement, and produces the same "word " output.
        sys.stdout.write("%s " % (word,))
        followers = cfdist[word]
        # A word that was only ever seen at the very end of the corpus has
        # no successor; stop rather than ask an empty distribution for its
        # maximum.
        if not followers:
            break
        word = followers.max()

 def autoparkin(screen_name="SimonParkin", word_count=10):
    """Generate text in the style of a Twitter user's recent timeline.

    Requests the user timeline for ``screen_name`` (``count=200``), tokenizes
    the ``text`` of every returned status while keeping hashtags, @mentions
    and URLs as single tokens, builds a bigram conditional frequency
    distribution over all tokens and passes it, together with a randomly
    chosen seed word, to ``generate_model``.

    Args:
        screen_name: screen name inserted into the timeline request URL.
        word_count: forwarded unchanged to ``generate_model``.

    Returns:
        Whatever ``generate_model`` returns.

    Raises:
        ValueError: if no tokens could be extracted from the timeline.
    """
    u = 'http://api.twitter.com/1/statuses/user_timeline.json?screen_name=%s&count=200&trim_user=true' % screen_name
    f = urllib.urlopen(u)
    try:
        d = f.read()
    finally:
        # Release the connection even when reading the response fails.
        f.close()

    tweets = anyjson.deserialize(d)
    # Groups are non-capturing: with capturing groups re.findall returns the
    # group contents instead of the whole match. The final alternative lives
    # on its own line; when it shared a line with the ":)" comment it was
    # part of that comment and never matched.
    pattern = r'''(?x)    # set flag to allow verbose regexps
         (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
       | \$?\w+(?:[-']\w+)*  # words with optional internal hyphens
       | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
       | \.\.\.              # ellipsis
       | \:\(                # :(
       | \:\)                # :)
       | [!?]                # these are separate tokens
    '''
    tokens = []

    for tweet in tweets:
        tweet_text = tweet['text']
        extractor = twitter_text.Extractor(tweet_text)

        # Loop variables are named so that they cannot overwrite the
        # ``screen_name`` parameter (list-comprehension variables leak into
        # the enclosing scope on Python 2).
        entities = ['#' + tag for tag in extractor.extract_hashtags()]
        entities.extend(
            '@' + mention
            for mention in extractor.extract_mentioned_screen_names())
        entities.extend(extractor.extract_urls())

        # (placeholder, entity) pairs; "$_ent_N" is matched as one token by
        # the word alternative of ``pattern``.
        entity_map = [("$_ent_%d" % num, entity)
                      for num, entity in enumerate(entities)]

        # Substitute the longest entities first so that a short entity that
        # also occurs inside a longer one (e.g. "#foo" within "#foobar" or
        # within a URL) cannot corrupt the longer one.
        longest_first = sorted(entity_map, key=lambda pair: len(pair[1]),
                               reverse=True)
        for placeholder, entity in longest_first:
            tweet_text = tweet_text.replace(entity, placeholder)

        tweet_tokens = nltk.regexp_tokenize(tweet_text, pattern)

        # Swap the placeholders back for the entity text they stand for.
        placeholder_to_entity = dict(entity_map)
        tokens.extend(placeholder_to_entity.get(token, token)
                      for token in tweet_tokens)

    if not tokens:
        # Same exception type random.randrange() raised for an empty
        # vocabulary, but with a message that names the actual problem.
        raise ValueError(
            "no tokens could be extracted from the timeline of %r"
            % (screen_name,))

    text = nltk.Text(tokens)
    # De-dup tokens to derive the vocab
    vocab = list(set(tokens))
    cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))
    seed = vocab[random.randrange(len(vocab))]
    return generate_model(cfd, seed, word_count=word_count)

 if __name__ == "__main__":
    # autoparkin() returns the result of generate_model(), which writes the
    # words to stdout itself and has no return statement; printing str() of
    # that result only appended the literal text "None" to the output.
    autoparkin()
    # Terminate the line of space-separated words.
    print("")
	#!/usr/bin/env python

	import urllib
	import anyjson
	import nltk
	import random
	import twitter_text
	import re

	def generate_model(cfdist, word, word_count=10):
	    """Print a chain of at most ``word_count`` words, starting at ``word``.

	    Every step writes the current word followed by a single space to
	    standard output and then moves on to the most frequent successor that
	    ``cfdist`` records for that word.

	    Args:
	        cfdist: mapping from a word to a frequency distribution of the
	            words seen after it. Each distribution must provide ``max()``
	            and be falsy when empty (as dict-based distributions are).
	        word: seed word the chain starts from.
	        word_count: upper bound on the number of words written.

	    Returns:
	        None. The words are only written to standard output.
	    """
	    # Local import: nothing else in the file needs ``sys``.
	    import sys

	    for _ in range(word_count):
	        # sys.stdout.write is valid on Python 2 and 3 alike, unlike the
	        # ``print word,`` statement, and produces the same "word " output.
	        sys.stdout.write("%s " % (word,))
	        followers = cfdist[word]
	        # A word that was only ever seen at the very end of the corpus
	        # has no successor; stop rather than ask an empty distribution
	        # for its maximum.
	        if not followers:
	            break
	        word = followers.max()

	def autoparkin(screen_name="SimonParkin", word_count=10):
	    """Generate text in the style of a Twitter user's recent timeline.

	    Requests the user timeline for ``screen_name`` (``count=200``),
	    tokenizes the ``text`` of every returned status while keeping
	    hashtags, @mentions and URLs as single tokens, builds a bigram
	    conditional frequency distribution over all tokens and passes it,
	    together with a randomly chosen seed word, to ``generate_model``.

	    Args:
	        screen_name: screen name inserted into the timeline request URL.
	        word_count: forwarded unchanged to ``generate_model``.

	    Returns:
	        Whatever ``generate_model`` returns.

	    Raises:
	        ValueError: if no tokens could be extracted from the timeline.
	    """
	    u = 'http://api.twitter.com/1/statuses/user_timeline.json?screen_name=%s&count=200&trim_user=true' % screen_name
	    f = urllib.urlopen(u)
	    try:
	        d = f.read()
	    finally:
	        # Release the connection even when reading the response fails.
	        f.close()

	    tweets = anyjson.deserialize(d)
	    # Alternatives are separated by a plain "|" (an escaped "\|" would
	    # match a literal pipe character instead). Groups are non-capturing:
	    # with capturing groups re.findall returns the group contents instead
	    # of the whole match. The final alternative lives on its own line;
	    # when it shared a line with the ":)" comment it never matched.
	    pattern = r'''(?x)    # set flag to allow verbose regexps
	         (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
	       | \$?\w+(?:[-']\w+)*  # words with optional internal hyphens
	       | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
	       | \.\.\.              # ellipsis
	       | \:\(                # :(
	       | \:\)                # :)
	       | [!?]                # these are separate tokens
	    '''
	    tokens = []

	    for tweet in tweets:
	        tweet_text = tweet['text']
	        extractor = twitter_text.Extractor(tweet_text)

	        # Loop variables are named so that they cannot overwrite the
	        # ``screen_name`` parameter (list-comprehension variables leak
	        # into the enclosing scope on Python 2).
	        entities = ['#' + tag for tag in extractor.extract_hashtags()]
	        entities.extend(
	            '@' + mention
	            for mention in extractor.extract_mentioned_screen_names())
	        entities.extend(extractor.extract_urls())

	        # (placeholder, entity) pairs; "$_ent_N" is matched as one token
	        # by the word alternative of ``pattern``.
	        entity_map = [("$_ent_%d" % num, entity)
	                      for num, entity in enumerate(entities)]

	        # Substitute the longest entities first so that a short entity
	        # that also occurs inside a longer one (e.g. "#foo" within
	        # "#foobar" or within a URL) cannot corrupt the longer one.
	        longest_first = sorted(entity_map, key=lambda pair: len(pair[1]),
	                               reverse=True)
	        for placeholder, entity in longest_first:
	            tweet_text = tweet_text.replace(entity, placeholder)

	        tweet_tokens = nltk.regexp_tokenize(tweet_text, pattern)

	        # Swap the placeholders back for the entity text they stand for.
	        placeholder_to_entity = dict(entity_map)
	        tokens.extend(placeholder_to_entity.get(token, token)
	                      for token in tweet_tokens)

	    if not tokens:
	        # Same exception type random.randrange() raised for an empty
	        # vocabulary, but with a message that names the actual problem.
	        raise ValueError(
	            "no tokens could be extracted from the timeline of %r"
	            % (screen_name,))

	    text = nltk.Text(tokens)
	    # De-dup tokens to derive the vocab
	    vocab = list(set(tokens))
	    cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))
	    seed = vocab[random.randrange(len(vocab))]
	    return generate_model(cfd, seed, word_count=word_count)

	if __name__ == "__main__":
	    # autoparkin() returns the result of generate_model(), which writes
	    # the words to stdout itself and has no return statement; printing
	    # str() of that result only appended the literal text "None".
	    autoparkin()
	    # Terminate the line of space-separated words.
	    print("")
No results found