Created
September 25, 2014 17:39
-
-
Save danhammer/109a3789bc811d29abab to your computer and use it in GitHub Desktop.
scrape spaceapps
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import itertools
import json
import os
import urllib

from bs4 import BeautifulSoup
import pandas
import requests
# Awards page of the 2014 Space Apps Challenge; downloaded by process_winners().
BASE = "https://2014.spaceappschallenge.org/awards/#globalnom"
# JSON endpoint of the Google geocoder, queried by google_geocode().
GOOGLE_GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'
# Character encoding applied to text values by encoded_dict() and challenge().
ENCODING = 'utf8'
def encoded_dict(in_dict):
    """Encode the values of a dictionary as given in ENCODING.

    Taken from http://stackoverflow.com/a/6481120/699026

    Returns a new dictionary with the same keys; the input is not modified.
    ``unicode`` values are replaced by their ENCODING-encoded byte strings.
    ``str`` values are kept unchanged but must already be valid ENCODING,
    otherwise decoding them raises ``UnicodeDecodeError``.  Values of any
    other type are copied over untouched.

    NOTE: relies on the Python 2 ``unicode``/``str`` split.
    """
    out_dict = {}
    for k, v in in_dict.items():
        if isinstance(v, unicode):
            v = v.encode(ENCODING)
        elif isinstance(v, str):
            # Validation only: the decoded result is deliberately discarded;
            # the call just raises when the bytes are not valid ENCODING.
            v.decode(ENCODING)
        out_dict[k] = v
    return out_dict
def _get_request(params, base_url):
    """Send a GET request and return the parsed JSON response.

    Accepts a dictionary of parameters that are passed as the query string,
    along with the base URL of the service (see the constants set at the top
    of the script).  The parameter values are encoded with encoded_dict()
    before being URL-encoded.

    Returns whatever ``json.loads`` produces for the response body after any
    leading/trailing square brackets have been stripped from it.
    """
    payload = urllib.urlencode(encoded_dict(params))
    url = "%s?%s" % (base_url, payload)
    # closing() releases the connection even when reading fails; the
    # response object was previously never closed.
    with contextlib.closing(urllib.urlopen(url)) as r:
        body = r.read()
    return json.loads(body.strip('[]'))
def google_geocode(address, api_key=None):
    """Accepts an address string and returns the geocoded location, as given
    by the Google geocoder.

    ``api_key`` defaults to the ``GOOGLE_KEY`` environment variable, looked up
    when the function is called (not at import time).  When no key is
    available the ``key`` parameter is left out of the request.

    Returns a dict with the keys ``address`` (the formatted address of the
    first result), ``lat`` and ``lon``.

    Raises ``Exception`` when the response status is not ``'OK'``; the message
    contains the address and the full response.
    """
    if api_key is None:
        # os.environ is a mapping, not a callable; read it lazily so that
        # importing the module does not require the variable to be set.
        api_key = os.environ.get('GOOGLE_KEY')

    d = dict(address=address)
    if api_key is not None:
        d['key'] = api_key

    gc = _get_request(d, GOOGLE_GEOCODE_URL)
    if gc['status'] != 'OK':
        raise Exception('Geocoding error for address: %s\n\n%s' % (address,
                                                                   gc))

    # Only the first (best) match is used.
    best = gc['results'][0]
    location = best['geometry']['location']
    return dict(address=best['formatted_address'],
                lat=location['lat'],
                lon=location['lng'])
def challenge(soup):
    """Accepts the soup object of the project specific website and returns the
    challenge that it addresses.

    The challenge is the text of the link inside the single ``<strong>``
    element whose text contains 'This project is solving', encoded as given
    in ENCODING.

    Raises ``ValueError`` unless exactly one such element is found.
    """
    bold_list = soup.find_all('strong')
    matches = [b for b in bold_list if 'This project is solving' in b.text]
    if len(matches) != 1:
        # Same exception type as a failed one-element unpacking, but with a
        # message that says what was being looked for.
        raise ValueError(
            "expected exactly one <strong> containing "
            "'This project is solving', found %d" % len(matches))
    project = matches[0]
    return project.a.text.encode(ENCODING)
def people(soup):
    """Accepts the soup object of the project specific website and returns
    the ``alt`` text of every image inside the team list.

    The team list is the single ``<ul>`` element that has ``team`` among its
    classes.  The resulting list is also printed as progress output.

    Raises ``ValueError`` unless exactly one such ``<ul>`` is found.
    """
    def _people_container(ul):
        classes = ul.get('class')
        return classes is not None and 'team' in set(classes)

    matches = [ul for ul in soup.find_all('ul') if _people_container(ul)]
    if len(matches) != 1:
        raise ValueError(
            "expected exactly one <ul> with class 'team', found %d"
            % len(matches))

    # Build the list once and reuse it for both the output and the result.
    names = [img['alt'] for img in matches[0].find_all('img')]
    print(names)
    return names
def project_data(relative_url):
    """Download a project page and return its team members and challenge.

    ``relative_url`` is appended to the site root
    'https://2014.spaceappschallenge.org'.  The full URL is printed as
    progress output before the page is requested.

    Returns a dict with the keys ``people`` (see people()) and ``challenge``
    (see challenge()).

    Raises ``requests.HTTPError`` when the server answers with an error
    status.
    """
    url = 'https://2014.spaceappschallenge.org' + relative_url
    print(url)
    r = requests.get(url)
    # Fail here with a clear HTTP error instead of trying to parse an error
    # page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text)
    return {
        'people': people(soup),
        'challenge': challenge(soup),
    }
def project_attrs(projects):
    """Add the scraped page attributes to every project dictionary.

    Each element of ``projects`` must have a ``url`` key; the result of
    project_data() for that URL is merged into the dictionary in place.

    Returns a new list holding the same (now updated) dictionaries.
    """
    def _update(d):
        d.update(project_data(d['url']))
        # dict.update() returns None, so hand back the dictionary itself.
        return d

    return [_update(x) for x in projects]
def process_entry(entry):
    """Turn one location entry of the awards page into a list of flat rows.

    ``entry`` is a soup element whose ``<h2>`` holds the place name and whose
    ``<li>`` children each describe one project.  The place is geocoded with
    google_geocode() and every project is extended with the data scraped from
    its own page.

    Returns one dictionary per project, combining the keys ``place``, ``lat``
    and ``lon`` with the project's own keys (``url``, ``name``, ``category``
    plus whatever project_attrs() adds).
    """
    place = entry.h2.text
    geocoded = google_geocode(place)

    def _project(li):
        return encoded_dict({
            'url': li.a['href'],
            'name': li.a.text,
            'category': li.br.text
        })

    projects = [_project(x) for x in entry.find_all('li')]
    # project_attrs() merges the scraped attributes into each dict in place,
    # so rely on that side effect rather than on its return value.
    project_attrs(projects)

    base = encoded_dict({
        'place': place,
        'lat': geocoded['lat'],
        'lon': geocoded['lon']
    })

    def _flatten(project):
        # Copy first so every row gets its own dict; project keys win on a
        # name clash, exactly like concatenating the item lists did.
        row = dict(base)
        row.update(project)
        return row

    return [_flatten(p) for p in projects]
def process_winners(idx=5):
    """Scrape the 2014 Space Apps Challenge awards page into a DataFrame.

    ``idx`` is the position, among all ``<ul>`` elements of the page at BASE,
    of the list to process (presumably the global nominations list; verify
    against the live page).  Each direct ``<li>`` child of that list is
    handed to process_entry().

    Returns a ``pandas.DataFrame`` with one row per project.  Nothing is
    written to disk.

    Raises ``requests.HTTPError`` when the awards page cannot be fetched.
    """
    r = requests.get(BASE)
    r.raise_for_status()
    soup = BeautifulSoup(r.text)
    global_nominations = soup.find_all('ul')[idx]
    # recursive=False: only the top-level entries, not the nested project
    # <li> elements inside each of them.
    cities = global_nominations.find_all("li", recursive=False)
    list2d = [process_entry(e) for e in cities]
    merged = list(itertools.chain.from_iterable(list2d))
    return pandas.DataFrame(merged)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment