# Gist kwellman/2605610: "Mining twitter blog post"
# Created May 5, 2012 21:11.
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears below. To review,
# open the file in an editor that reveals hidden Unicode characters.
import re, urllib, urllib2, json | |
# Substrings whose presence in a tweet causes blacklist() to reject it.
BLACKLIST = ['odesk', 'elance', '#jobs', 'now hiring']
def search_twitter(query, no_retweets=True):
    """Run a Twitter search for `query` and return the matching tweets.

    Args:
        query: Search phrase sent as the ``q`` request parameter.
        no_retweets: When true (the default), ``' -RT'`` is appended to the
            query so the search excludes retweets.

    Returns:
        The value stored under the ``'results'`` key of the decoded JSON
        response.

    Raises:
        KeyError: If the decoded response has no ``'results'`` key.
        ValueError: If the response body is not valid JSON.
        Any error raised by ``urlopen`` on network or HTTP failure.
    """
    # Imported locally so this function works on both Python 2 and Python 3
    # regardless of which urllib flavour the module imported at the top.
    from contextlib import closing
    try:
        from urllib.parse import urlencode
        from urllib.request import urlopen
    except ImportError:  # Python 2
        from urllib import urlencode
        from urllib2 import urlopen

    if no_retweets:
        # use the negation operator to filter out retweets
        query += ' -RT'

    # NOTE(review): this is the legacy search.twitter.com JSON endpoint;
    # confirm it is still served before relying on this function.
    url = 'http://search.twitter.com/search.json?%s' % urlencode({
        'q': query,
        'lang': 'en',  # restrict results to english tweets
        'rpp': 100,  # return 100 results per page (maximum value)
    })

    # closing() releases the connection even if read() raises.
    with closing(urlopen(url)) as handle:
        body = handle.read()

    # read() yields bytes; decode explicitly so json.loads gets text on every
    # Python version.
    response = json.loads(body.decode('utf-8'))
    return response['results']
def init_queries():
    """Return every search query: each wish phrase paired with each product type.

    Returns:
        A list of strings of the form ``'<phrase> <suffix>'``, ordered by
        phrase first and then by suffix (app, site, website).
    """
    phrases = [
        'wish there was',
        "why isn't there",
        'wish someone would create',
        'somebody needs to create',
        'somebody should create',
        'someone needs to create',
        'someone should create',
    ]
    # type of product appended to the end of each phrase
    suffixes = ('app', 'site', 'website')
    return ['%s %s' % (phrase, suffix)
            for phrase in phrases
            for suffix in suffixes]
def make_regex(query):
    """Return a compiled regex that loosely matches `query` inside a tweet.

    The words of `query` must appear in order, but up to two extra
    whitespace-delimited tokens may sit between each consecutive pair, so
    ``'wish there was app'`` also matches ``'wish there was an app'``.
    Matching is case-insensitive. Use like so: ``regex.search(tweet)``.

    Args:
        query: Whitespace-separated words to look for.

    Returns:
        A compiled pattern object.
    """
    # Escape each word so regex metacharacters in a query (e.g. "c++") are
    # matched literally instead of raising or changing the pattern.
    words = [re.escape(word) for word in query.split()]
    # Between two query words: at most two extra tokens, then whitespace.
    # \s+ (rather than a single \s) tolerates double spaces and newlines;
    # the group is non-capturing because callers only test for a match.
    gap = r'(?:\s+\S+){0,2}\s+'
    return re.compile(gap.join(words), re.IGNORECASE)
def blacklist(text):
    """Return True if the tweet `text` should be rejected.

    A tweet is rejected when it starts with a straight or left curly double
    quote, or when it contains any BLACKLIST entry (compared
    case-insensitively).

    Args:
        text: The tweet text; may be empty.

    Returns:
        True if the tweet is rejected, False otherwise.
    """
    # reject if tweet starts with quotes; slicing (not indexing) keeps an
    # empty string from raising IndexError
    if text[:1] in ('"', u'\u201c'):
        return True
    # compare in lowercase so entries such as 'elance' also catch "Elance"
    lowered = text.lower()
    return any(bad.lower() in lowered for bad in BLACKLIST)
if __name__ == '__main__':
    # Command-line entry point: pick one query by index, search for it, and
    # print the tweets that match the loose regex and pass the blacklist.
    import sys

    queries = init_queries()
    try:
        index = int(sys.argv[1])
    except (ValueError, IndexError):
        # missing or non-integer argument: show usage and the numbered queries
        print('Usage: %s <int>' % sys.argv[0])
        for i, query in enumerate(queries):
            print('%s: %s' % (i, query))
        sys.exit()

    # the modulo wraps out-of-range indexes back into the query list
    query = queries[index % len(queries)]
    results = search_twitter(query)
    regex = make_regex(query)
    for tweet in results:
        text = tweet['text']
        if regex.search(text) and not blacklist(text):
            # single-argument print(...) prints the same on Python 2 and 3
            print(text)
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.