Created
February 10, 2016 14:04
-
-
Save jcchurch/f9cfb8c01c168e926540 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import urllib.request | |
import urllib.parse | |
def downloadPage(domain, path):
    """Fetch ``domain + path`` and return the response body as a list of lines.

    Args:
        domain: Leading part of the URL (scheme and host).
        path: Remainder of the URL, concatenated directly onto ``domain``.

    Returns:
        The list produced by ``readlines()`` on the response object
        (raw, undecoded lines).

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable or
        invalid URL.
    """
    # Use the response as a context manager so the connection is closed
    # even if reading fails part-way through.
    with urllib.request.urlopen(domain + path) as filehandle:
        return filehandle.readlines()
def getEntry(page, i):
    """Return element ``i`` of ``page`` as cleaned-up text.

    The element is decoded as UTF-8, every ``</td>`` occurrence is removed,
    and leading/trailing whitespace is stripped.

    Args:
        page: Indexable collection whose elements support ``.decode``.
        i: Index of the element to extract.

    Returns:
        The cleaned string.
    """
    text = page[i].decode("utf-8")
    without_closing_tag = text.replace("</td>", "")
    return without_closing_tag.strip()
def getOneRecords(page, i):
    """Collect the field values of one record, starting at index ``i``.

    Walks ``page`` two elements at a time from ``i``. Whenever the current
    element contains one of the known field markers, the cleaned text of
    the *following* element (via ``getEntry``) is appended to the record.
    The walk ends after the element containing ``views-field-fte`` has been
    processed.

    Args:
        page: Indexable collection of page lines.
        i: Index at which the record starts.

    Returns:
        List of extracted field strings, in the order encountered.

    Note:
        There is no bounds check: if no element on the walk contains
        ``views-field-fte``, indexing eventually runs past the end of
        ``page`` and raises.
    """
    field_markers = (
        "views-field-institution-1",
        "views-field-lastname",
        "views-field-firstname",
        "views-field-jobtitle",
        "views-field-department",
        "views-field-php",
        "views-field-fte",
    )
    row = []
    cursor = i
    while True:
        current = str(page[cursor])
        # One value is appended per marker found on the current line.
        row.extend(
            getEntry(page, cursor + 1)
            for marker in field_markers
            if marker in current
        )
        if "views-field-fte" in current:
            return row
        cursor += 2
def getRecords(page):
    """Extract all records from one page, discarding the first match.

    Every element of ``page`` containing ``views-field-institution-1`` is
    treated as the start of a record and handed to ``getOneRecords``. The
    first record found is dropped (presumably the table header row --
    confirm against the page markup).

    Args:
        page: Sequence of page lines.

    Returns:
        List of records (each a list of strings). Returns an empty list
        when the page contains no matching lines, instead of raising
        ``IndexError`` from popping an empty list.
    """
    records = [
        getOneRecords(page, index)
        for index, line in enumerate(page)
        if "views-field-institution-1" in str(line)
    ]
    # Slicing drops the first match like pop(0) did, but is safe when
    # nothing matched.
    return records[1:]
def getPathForNextPage(page):
    """Return the path of the next results page, or ``""`` if there is none.

    Each element of ``page`` containing ``pager-next`` is decoded as UTF-8
    and searched for the first ``/hr/...`` substring running up to the next
    double quote. HTML-escaped ampersands (``&amp;``) in that substring are
    converted back to ``&`` so the result can be used directly as a URL
    path. If several lines match, the last one wins.

    Args:
        page: Iterable of page lines (bytes).

    Returns:
        The extracted path, or ``""`` when no ``pager-next`` line with an
        ``/hr/`` link is present.
    """
    url = ""
    for line in page:
        if "pager-next" not in str(line):
            continue
        # Raw string: the pattern needs no backslash escapes for "/".
        m = re.search(r'/hr/[^"]*', line.decode("utf-8"))
        if m is None:
            # Marker present but no link on this line; keep looking.
            continue
        # The href is HTML-escaped in the page source; undo that for "&".
        url = m.group().replace("&amp;", "&")
    return url
def downloadTBR(domain, path):
    """Download every results page starting at ``path`` and gather the records.

    Repeatedly fetches ``domain + path`` with ``downloadPage``, appends the
    records found by ``getRecords``, and follows the path returned by
    ``getPathForNextPage`` until that path is the empty string.

    Args:
        domain: Leading part of the URL (scheme and host).
        path: Path of the first page to fetch; ``""`` fetches nothing.

    Returns:
        List of all records collected across the visited pages.
    """
    collected = []
    next_path = path
    while True:
        if next_path == "":
            return collected
        lines = downloadPage(domain, next_path)
        collected.extend(getRecords(lines))
        next_path = getPathForNextPage(lines)
def formatRecords(records):
    """Print each record on its own line with fields separated by ``|``.

    Args:
        records: Iterable of records, each an iterable of strings.
    """
    for row_text in map("|".join, records):
        print(row_text)
if __name__ == "__main__":
    # Script entry point: crawl the salary listing from its first page and
    # print every collected record as a pipe-separated line.
    domain = "https://www.tbr.edu"
    path = "/hr/salaries?firstname=&lastname=&department=&jobtitle=&institution=&page="
    records = downloadTBR(domain, path)
    formatRecords(records)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment