Skip to content

Instantly share code, notes, and snippets.

@danhammer
Created September 25, 2014 17:39
Show Gist options
  • Save danhammer/109a3789bc811d29abab to your computer and use it in GitHub Desktop.
Save danhammer/109a3789bc811d29abab to your computer and use it in GitHub Desktop.
scrape spaceapps
from bs4 import BeautifulSoup
import requests
import urllib
import pandas
import json
import itertools
import os
# Awards page of the 2014 Space Apps Challenge site; fetched by
# process_winners().
BASE = "https://2014.spaceappschallenge.org/awards/#globalnom"
# JSON endpoint that google_geocode() sends its address lookups to.
GOOGLE_GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'
# Codec used wherever this script turns text values into byte strings.
ENCODING = 'utf8'
def encoded_dict(in_dict, encoding=None):
    """Return a copy of ``in_dict`` whose text values are encoded to bytes.

    Adapted from http://stackoverflow.com/a/6481120/699026

    Args:
        in_dict: Mapping to copy. Keys are kept exactly as they are.
        encoding: Name of the codec to use. When omitted, the module-level
            ``ENCODING`` constant is used.

    Returns:
        A new dict. Text values (``unicode`` on Python 2, ``str`` on
        Python 3) are replaced by their encoded byte strings; every other
        value is carried over unchanged.

    Raises:
        UnicodeDecodeError: If a byte-string value cannot be decoded with
            ``encoding``.
    """
    if encoding is None:
        encoding = ENCODING
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str  # Python 3 has no ``unicode``; ``str`` is text
    out_dict = {}
    for key, value in in_dict.items():
        if isinstance(value, text_type):
            value = value.encode(encoding)
        elif isinstance(value, bytes):
            # Validation only: the decoded result is discarded on purpose,
            # the call exists to raise if the bytes are not valid.
            value.decode(encoding)
        out_dict[key] = value
    return out_dict
def _get_request(params, base_url):
    """Send ``params`` as a GET query to ``base_url`` and parse the JSON reply.

    Args:
        params: Dictionary of query parameters. The values are passed
            through ``encoded_dict`` before being URL-encoded.
        base_url: URL of the service, without a query string.

    Returns:
        The object produced by ``json.loads`` for the response body.
    """
    import contextlib
    try:
        # Python 2 keeps both helpers directly on ``urllib``.
        from urllib import urlencode, urlopen
    except ImportError:
        # Python 3 moved them into submodules.
        from urllib.parse import urlencode
        from urllib.request import urlopen

    payload = urlencode(encoded_dict(params))
    # closing() releases the connection even if read() raises.
    with contextlib.closing(urlopen("%s?%s" % (base_url, payload))) as resp:
        body = resp.read()
    if not isinstance(body, str):
        # Python 3 returns bytes; Python 2 already returns ``str`` and is
        # left untouched.
        body = body.decode(ENCODING)
    # Leading/trailing square brackets are dropped before parsing, as in the
    # original code. NOTE(review): presumably for services that wrap a single
    # object in a list -- confirm against the services actually used.
    return json.loads(body.strip('[]'))
def google_geocode(address, api_key=None):
    """Geocode ``address`` with the Google geocoder.

    Args:
        address: Address string to look up.
        api_key: Google API key. When omitted, the ``GOOGLE_KEY``
            environment variable is read at call time; if that is unset
            too, the request is sent without a ``key`` parameter.

    Returns:
        A dict with keys ``address`` (the formatted address of the first
        result), ``lat`` and ``lon``.

    Raises:
        Exception: If the response status is anything other than ``'OK'``.
            The message contains the address and the full response.
    """
    if api_key is None:
        # os.environ is a mapping, not a callable. Looking the key up here
        # rather than in the signature avoids failing at import time.
        api_key = os.environ.get('GOOGLE_KEY')
    params = dict(address=address)
    if api_key:
        params['key'] = api_key
    gc = _get_request(params, GOOGLE_GEOCODE_URL)
    if gc['status'] != 'OK':
        raise Exception('Geocoding error for address: %s\n\n%s' % (address,
                                                                    gc))
    # Only the first result is used.
    best = gc['results'][0]
    location = best['geometry']['location']
    return dict(address=best['formatted_address'],
                lat=location['lat'],
                lon=location['lng'])
def challenge(soup):
    """Return the challenge named on a project page, encoded with ENCODING.

    ``soup`` is the parsed project page. The challenge is the text of the
    link inside the single ``<strong>`` element whose text contains
    'This project is solving'.

    Raises ValueError when the page does not contain exactly one such
    element.
    """
    marker = 'This project is solving'
    candidates = [tag for tag in soup.find_all('strong') if marker in tag.text]
    # Unpacking into a one-element target enforces "exactly one match".
    [solving] = candidates
    link_text = solving.a.text
    return link_text.encode(ENCODING)
def people(soup):
    """Print and return the ``alt`` text of every image in the team list.

    ``soup`` is the parsed project page. The team list is the single
    ``<ul>`` element whose ``class`` attribute includes 'team'.

    Raises ValueError when the page does not contain exactly one such list.
    """
    team_lists = []
    for ul in soup.find_all('ul'):
        classes = ul.get('class')
        # Lists without a class attribute can never be the team list.
        if classes is not None and 'team' in set(classes):
            team_lists.append(ul)
    # Unpacking into a one-element target enforces "exactly one match".
    [team] = team_lists
    names = [img['alt'] for img in team.find_all('img')]
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(names)
    return names
def project_data(relative_url):
    """Download one project page and extract its team and challenge.

    ``relative_url`` is appended to the site root
    'https://2014.spaceappschallenge.org'. The resulting URL is printed
    before it is requested.

    Returns a dict with keys 'people' (from ``people``) and 'challenge'
    (from ``challenge``).
    """
    page_url = 'https://2014.spaceappschallenge.org' + relative_url
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(page_url)
    response = requests.get(page_url)
    page = BeautifulSoup(response.text)
    # Same extraction order as before: team first, then challenge.
    members = people(page)
    solved = challenge(page)
    return {'people': members, 'challenge': solved}
def project_attrs(projects, fetch=None):
    """Add the attributes scraped from each project's page to its dict.

    Args:
        projects: Iterable of dicts, each holding the project's page
            location under the key 'url'.
        fetch: Optional callable that takes a project's 'url' value and
            returns a dict of extra attributes. Defaults to
            ``project_data``.

    Returns:
        A list containing the same dict objects, in order, each updated in
        place with the fetched attributes.
    """
    if fetch is None:
        fetch = project_data
    enriched = []
    for project in projects:
        # dict.update() mutates in place and returns None, so its result
        # must not be returned; collect the updated dict itself instead.
        project.update(fetch(project['url']))
        enriched.append(project)
    return enriched
def process_entry(entry):
    """Turn one city entry of the awards page into flat project records.

    ``entry`` is the parsed element for one city: its ``<h2>`` text is the
    place name and each ``<li>`` inside it describes one project.

    The place is geocoded with ``google_geocode`` and every project is
    enriched through ``project_attrs``.

    Returns a list with one dict per project. Each dict combines the
    shared 'place', 'lat' and 'lon' values with that project's own keys.
    """
    place = entry.h2.text
    # Geocode before any project page is fetched, as the original did.
    geocoded = google_geocode(place)

    projects = []
    for li in entry.find_all('li'):
        projects.append(encoded_dict({
            'url': li.a['href'],
            'name': li.a.text,
            'category': li.br.text
        }))

    processed = encoded_dict({
        'place': place,
        'lat': geocoded['lat'],
        'lon': geocoded['lon'],
        'projects': project_attrs(projects)
    })

    shared = {key: processed[key] for key in ['place', 'lat', 'lon']}
    records = []
    for project in processed['projects']:
        # Project keys come last so they win over the shared ones.
        pairs = list(shared.items()) + list(project.items())
        records.append(dict(pairs))
    return records
def process_winners(idx=5):
    """Scrape the 2014 awards page into a DataFrame of project records.

    ``idx`` is the position, among all ``<ul>`` elements of the page at
    ``BASE``, of the list to process. Each direct ``<li>`` child of that
    list is handled by ``process_entry``.
    NOTE(review): the default of 5 presumably selects the global
    nominations list -- confirm against the live page.

    Returns a ``pandas.DataFrame`` built from the records of all entries.
    """
    response = requests.get(BASE)
    page = BeautifulSoup(response.text)
    nominations = page.findAll('ul')[idx]
    per_city = []
    # recursive=False keeps only the top-level items of the chosen list.
    for city in nominations.find_all("li", recursive=False):
        per_city.append(process_entry(city))
    rows = list(itertools.chain.from_iterable(per_city))
    return pandas.DataFrame(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment