Skip to content

Instantly share code, notes, and snippets.

@jcchurch
Created February 10, 2016 14:04
Show Gist options
  • Save jcchurch/f9cfb8c01c168e926540 to your computer and use it in GitHub Desktop.
Save jcchurch/f9cfb8c01c168e926540 to your computer and use it in GitHub Desktop.
import re
import urllib.request
import urllib.parse
def downloadPage(domain, path):
    """Fetch ``domain + path`` and return the response body split into lines.

    Args:
        domain: Leading part of the URL; it is concatenated with ``path``
            as-is, with no separator inserted.
        path: Trailing part of the URL (path and query string).

    Returns:
        The list produced by ``readlines()`` on the opened response. Each
        element is a ``bytes`` line that still carries its line ending.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable or
        invalid URL; nothing is caught here.
    """
    # The context manager guarantees the underlying connection is closed,
    # even when reading the body fails part-way through.
    with urllib.request.urlopen(domain + path) as response:
        return response.readlines()
def getEntry(page, i):
    """Return the text of line ``i`` of ``page`` as a cleaned-up string.

    The line is decoded as UTF-8, every ``</td>`` occurrence is removed, and
    surrounding whitespace is stripped.

    Args:
        page: Sequence of ``bytes`` lines.
        i: Index of the line to extract.

    Returns:
        The cleaned ``str`` value.
    """
    decoded = page[i].decode("utf-8")
    without_closing_tag = decoded.replace("</td>", "")
    return without_closing_tag.strip()
def getOneRecords(page, i):
    """Collect the field values of one record, starting at line ``i``.

    Starting at ``i`` and advancing two lines at a time, each visited line is
    checked for one of the known ``views-field-*`` markers. When a marker is
    found, the cleaned text of the *following* line is appended to the
    record. Scanning stops after the line carrying ``views-field-fte``.

    Args:
        page: Sequence of ``bytes`` lines.
        i: Index of the line where the record starts.

    Returns:
        A list of field values (``str``) in the order they were encountered.
        If the page ends before the ``views-field-fte`` marker is seen, the
        values gathered so far are returned instead of raising ``IndexError``.
    """
    tags = (
        "views-field-institution-1",
        "views-field-lastname",
        "views-field-firstname",
        "views-field-jobtitle",
        "views-field-department",
        "views-field-php",
        "views-field-fte",
    )
    total = len(page)
    record = []
    # Bounded scan: a missing fte marker must not walk off the end of the page.
    while i < total:
        # str() of a bytes line is its repr, which still contains the ASCII
        # marker text, so a plain substring test is sufficient.
        line = str(page[i])
        for tag in tags:
            # The value lives on the next line; skip it if that line is absent.
            if tag in line and i + 1 < total:
                record.append(getEntry(page, i + 1))
        if "views-field-fte" in line:
            break
        i += 2
    return record
def getRecords(page):
    """Extract every record found in ``page``.

    A record starts at each line containing ``views-field-institution-1``;
    its fields are gathered by ``getOneRecords``.

    Args:
        page: Sequence of ``bytes`` lines.

    Returns:
        A list of records (each a list of ``str``), with the first match
        dropped. A page with no matches yields an empty list rather than
        raising ``IndexError``.
    """
    records = [
        getOneRecords(page, index)
        for index, line in enumerate(page)
        if "views-field-institution-1" in str(line)
    ]
    # The first match is discarded, exactly as before.
    # NOTE(review): presumably this is the table header row - confirm against
    # the page markup.
    return records[1:]
def getPathForNextPage(page):
    """Return the path of the "next page" link found in ``page``.

    Every line containing ``pager-next`` is searched for a substring that
    starts with ``/hr/`` and runs up to the next double quote. ``&amp;``
    entities in the match are replaced by ``&``. If several lines qualify,
    the last one wins.

    Args:
        page: Sequence of ``bytes`` lines.

    Returns:
        The path as ``str``, or ``""`` when no usable link is present.
    """
    url = ""
    for line in page:
        # str() of a bytes line is its repr, which still contains the ASCII
        # marker text.
        if "pager-next" not in str(line):
            continue
        # Raw string: "/" needs no escaping, and "\/" is an invalid escape
        # sequence in a normal string literal.
        match = re.search(r'/hr/[^"]*', line.decode("utf-8"))
        # A pager line without an /hr/ link is skipped instead of crashing.
        if match is not None:
            url = match.group().replace("&amp;", "&")
    return url
def downloadTBR(domain, path):
    """Download every page reachable from ``path`` and gather all records.

    Starting at ``domain + path``, each page is downloaded, its records are
    extracted, and the "next page" path is followed until it comes back as
    an empty string.

    Args:
        domain: Leading part of the URL shared by all pages.
        path: Path of the first page; an empty string means nothing to fetch.

    Returns:
        A single list holding the records of all visited pages, in order.
    """
    collected = []
    next_path = path
    while next_path != "":
        lines = downloadPage(domain, next_path)
        collected.extend(getRecords(lines))
        next_path = getPathForNextPage(lines)
    return collected
def formatRecords(records):
    """Print each record on its own line with fields separated by ``|``.

    Args:
        records: Iterable of records, each an iterable of ``str`` fields.
    """
    for fields in records:
        row = "|".join(fields)
        print(row)
if __name__ == "__main__":
    # Script entry point: download every page starting from the URL below
    # and print the collected records, one pipe-separated line per record.
    domain = "https://www.tbr.edu"
    path = "/hr/salaries?firstname=&lastname=&department=&jobtitle=&institution=&page="
    formatRecords(downloadTBR(domain, path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment