Last active
December 1, 2015 18:03
-
-
Save odedlaz/684e02aef699667b4545 to your computer and use it in GitHub Desktop.
download all mathematicians from genealogy.math.ndsu.nodak.edu to a csv file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict
from pyquery import PyQuery
import requests  # was missing: requests.get below raised NameError
import csv

# Scrape every mathematician from the Mathematics Genealogy Project
# (genealogy.math.ndsu.nodak.edu) and write one (person, pupil) row per
# advisor/student pair to a CSV file, streaming as we go.

# URL template for a person's page, keyed by a numeric id.
url_format = "http://genealogy.math.ndsu.nodak.edu/id.php?id=%d"

# The response body contains this text when an id does not exist.
# NOTE(review): the stop condition below assumes ids are contiguous --
# if id 4 is missing but id 5 exists, the loop stops too early.
# Confirm against the site before trusting a full run.
missing_response = "You have specified an ID that does not exist in the database"

# Start with the first id; the last id is unknown, hence the while-True loop.
person_id = 1

# CSV is a format Excel opens directly. We write *during* scraping, not
# after, to keep memory usage flat. newline='' is how the csv module wants
# the file opened in Python 3 (avoids blank interleaved rows on Windows).
# Change this path to whatever you like.
csvfile = open('/tmp/names.csv', 'w', newline='')
print("writing all data to: %s" % csvfile.name)
# Write the header row.
writer = csv.DictWriter(csvfile, fieldnames=['person', 'pupil'])
writer.writeheader()

# Change this to `while person_id < 10` to smoke-test the script.
while True:
    print("trying to fetch details for id: %d" % person_id)
    # Download the person's page.
    response = requests.get(url_format % person_id)
    # The site returns HTTP 200 even for missing people, so detect the
    # "does not exist" message in the body instead of checking the status.
    if missing_response in response.text:
        print("we're done! id %d doesn't exist" % person_id)
        break
    # Use PyQuery to select the elements we want from the page.
    q = PyQuery(response.text)
    # The person's name is the h2 header (found by inspecting the HTML).
    person_name = q("h2").text().strip()
    print("found a new person! his name is: %s" % person_name)
    # Pupils sit in the first cell (td) of every table row (tr).
    # Materialize as a list: in Python 3 a bare map() is a lazy iterator,
    # which would print as "<map object ...>" in the line below.
    pupils = [PyQuery(x).text().strip() for x in q("table tr td:first-child")]
    print("person_name: %s | pupils: %s" % (person_name, pupils))
    # If there are no pupils the loop body simply writes nothing.
    for pupil in pupils:
        writer.writerow({'person': person_name, 'pupil': pupil})
    # Increment manually -- not a for-loop because the last id is unknown.
    person_id += 1
    # Flush to disk every 10 people so a crash loses at most 10 rows.
    # (The original `if person_id % 10:` was inverted: it flushed on every
    # id NOT divisible by 10 and skipped the multiples of 10.)
    if person_id % 10 == 0:
        csvfile.flush()

# Remember to close the file at the end!
csvfile.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment