Created
October 22, 2011 16:30
-
-
Save rizumu/1306175 to your computer and use it in GitHub Desktop.
university_locator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import urllib2
from contextlib import closing
from urlparse import urlparse

from BeautifulSoup import BeautifulSoup
# Page whose <li> elements are scraped for college entries.
INDEX_URL = 'http://www.utexas.edu/world/univ/alpha/'

# Patterns applied to the serialized HTML of a single <li> element.
# Each one deliberately captures a leading delimiter ('">' or '(') that is
# stripped off again in parse_list_item().
URL_PATTERN = re.compile(r'http\://[a-zA-Z0-9-\.]*')
NAME_PATTERN = re.compile(r'"\>[a-zA-Z-\.&/\ ]*')
STATE_PATTERN = re.compile(r'\([A-Z/]*')


def parse_list_item(item_html):
    """Extract a ``(url, name, state)`` tuple from one list item's HTML.

    Args:
        item_html: the string form of a single ``<li>`` element.

    Returns:
        A 3-tuple of strings ``(url, name, state)``. ``state`` is ``''``
        when the item carries no parenthesised state code. Returns ``None``
        when the item has no ``http://`` URL or no link text, so callers can
        skip list items that are not college entries instead of crashing.
    """
    url_match = URL_PATTERN.search(item_html)
    name_match = NAME_PATTERN.search(item_html)
    if url_match is None or name_match is None:
        return None

    # Some items do not have state data; fall back to an empty string.
    state_match = STATE_PATTERN.search(item_html)
    state = state_match.group() if state_match else ''

    # Strip the extraneous delimiter characters that anchored the regexes.
    return (url_match.group(), name_match.group().strip('">'), state.strip('('))


def fetch_html(url):
    """Download *url* and return the raw response body."""
    # closing() releases the socket even if read() raises.
    with closing(urllib2.urlopen(url)) as usock:
        return usock.read()


def extract_colleges(html):
    """Return a list of ``(url, name, state)`` tuples found in *html*.

    The markup is parsed with BeautifulSoup and every ``<li>`` element is
    run through parse_list_item(); items it cannot parse are skipped.
    """
    soup = BeautifulSoup(html)
    colleges = []
    for item in soup.findAll('li'):
        college = parse_list_item(str(item))
        if college is not None:
            colleges.append(college)
    return colleges


def main():
    """Driver code: fetch the index page and print one tuple per college."""
    for college in extract_colleges(fetch_html(INDEX_URL)):
        print(college)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment