Created
September 5, 2009 04:32
-
-
Save bycoffe/181287 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A small Django app I built last year that looked for people on Twitter saying they were sick. | |
An example of using Twitter search. | |
""" | |
# models.py | |
from django.contrib.gis.db import models | |
class Phrase(models.Model):
    """A search phrase to look for on Twitter (e.g. "I am sick")."""
    phrase = models.CharField(max_length=100)

    def __unicode__(self):
        return self.phrase
class TwitterUser(models.Model):
    """A Twitter account seen in search results.

    `pt` holds the geocoded point for `location`; it is nullable, though
    get_pt() in get_tweets.py falls back to Point(0, 0) rather than None.
    """
    name = models.CharField(max_length=100)
    username = models.CharField(max_length=100, unique=True)
    location = models.CharField(max_length=100, blank=True)
    pt = models.PointField(null=True)

    def __unicode__(self):
        return self.username
class Tweet(models.Model):
    """One matched tweet, linked to the phrase that found it and its author."""
    # Twitter status ids passed the signed 32-bit limit in 2009 (the
    # "Twitpocalypse"), so a plain IntegerField overflows; a 64-bit
    # BigIntegerField is required. Backward compatible: every value a
    # 32-bit field held fits in 64 bits.
    twitter_id = models.BigIntegerField(unique=True)
    tweet = models.CharField(max_length=255)
    phrase = models.ForeignKey(Phrase)
    user = models.ForeignKey(TwitterUser)
    published = models.DateTimeField()
    # Soft-delete / moderation flag; nothing in this file clears it.
    is_active = models.BooleanField(default=True)

    def __unicode__(self):
        return self.tweet
# get_tweets.py | |
import datetime | |
import os | |
import sys | |
import urllib | |
import urllib2 | |
from django.core.management import setup_environ | |
from django.contrib.gis.geos import * | |
from django.http import HttpRequest | |
from BeautifulSoup import BeautifulSoup | |
from dateutil.parser import parse as dateparse | |
from geopy import geocoders | |
URL = "http://search.twitter.com/search.atom?%s" | |
g = geocoders.Google('') | |
def phrase_urls():
    """Yield (Phrase, Twitter-search-URL) pairs for every stored phrase.

    Each phrase is wrapped in double quotes so Twitter matches it as an
    exact phrase rather than as separate words.
    """
    from sickontwitter.models import Phrase
    for phrase in Phrase.objects.all():
        quoted = '"%s"' % phrase.phrase
        yield phrase, URL % urllib.urlencode({'q': quoted})
def url_to_soup(url):
    """Fetch `url` and return its body parsed with BeautifulSoup.

    Fix: the original closed the response only on the success path; if
    read() or the BeautifulSoup constructor raised, the handle leaked.
    The try/finally guarantees the close.
    """
    page = urllib2.urlopen(url)
    try:
        return BeautifulSoup(page.read())
    finally:
        page.close()
def parse_xml(soup, phrase):
    """Store every <entry> in a Twitter Atom search feed as a Tweet for `phrase`."""
    for entry in soup.findAll('entry'):
        parse_entry(entry, phrase)
def fix_tags(string):
    """Strip the <b> highlight tags Twitter search wraps around matched
    terms, then decode the HTML entities the Atom feed escapes content
    with.

    Fix: the entity replacements had been corrupted (by an HTML-rendering
    round trip) into no-ops like replace('<', '<'); restored to decode
    &lt;, &gt; and &amp;. &amp; is decoded last so that the '&' it
    produces is never re-read as the start of another entity.
    """
    string = string.replace('<b>', '').replace('</b>', '')
    string = string.replace('&lt;', '<').replace('&gt;', '>')
    string = string.replace('&amp;', '&')
    return string
def parse_entry(entry, phrase):
    """Persist one Atom <entry> as a Tweet; does nothing if the tweet's
    id is already in the database. Prints newly created tweets.
    """
    from sickontwitter.models import Tweet

    published = dateparse(entry.find('published').contents[0])
    link = entry.find('link')['href']
    # The status id is the last path segment of the tweet's permalink.
    twitter_id = link.split('/')[-1]
    message = fix_tags(entry.find('content').contents[0])
    author = get_author(entry.find('author').find('uri').contents[0])

    tweet, created = Tweet.objects.get_or_create(
        twitter_id=twitter_id,
        defaults={
            'tweet': message,
            'phrase': phrase,
            'user': author,
            'published': published,
        })
    if created:
        print(tweet)
def get_author(url):
    """Return the TwitterUser for a profile URL, creating it on first sight.

    New users are built by scraping the profile page for name/location
    and geocoding the location into a point.
    """
    from sickontwitter.models import TwitterUser

    # The username is the last path segment of the profile URL.
    username = unicode(url.split('/')[-1])
    try:
        return TwitterUser.objects.get(username=username)
    except TwitterUser.DoesNotExist:
        pass  # unseen user: scrape and create below

    name, location = parse_author_page(url_to_soup(url))
    return TwitterUser.objects.create(
        name=name,
        username=username,
        location=location,
        pt=get_pt(location))
def get_pt(location):
    """Geocode a free-text location to a Point(lng, lat).

    Returns Point(0, 0) when the location is empty, or when the geocoder
    result does not unpack as a single (place, (lat, lng)) pair — the
    tuple unpack raises ValueError on zero or multiple matches.
    """
    fallback = Point(0, 0)
    if not location:
        return fallback
    try:
        _place, (lat, lng) = g.geocode(location)
    except ValueError:
        return fallback
    # Note the argument order: GEOS points are (x, y) == (lng, lat).
    return Point(float(lng), float(lat))
def parse_author_page(soup):
    """Extract (name, location) from a scraped Twitter profile page.

    Either value comes back as u'' when its <span> is absent —
    soup.find() returns None then, and .contents raises AttributeError.
    """
    def span_text(css_class):
        try:
            return soup.find('span', {'class': css_class}).contents[0]
        except AttributeError:
            return ''

    return unicode(span_text('fn')), unicode(span_text('adr'))
def _main():
    """Wire up Django from the settings module in the current working
    directory, then fetch and store tweets for every stored phrase.
    """
    sys.path.append(os.getcwd())
    import settings
    setup_environ(settings)

    for phrase, url in phrase_urls():
        parse_xml(url_to_soup(url), phrase)


if __name__ == '__main__':
    _main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment