Quick web-scraper for Te Papa online collections. Original request: "I wanna scrape these pages: 'http://collections.tepapa.govt.nz/Object/46209'. Object 0 to 10,000."
from bs4 import BeautifulSoup
import requests
import json
def unlistIfSingle(givenList):
    """
    Return the sole element of the given list if it contains exactly one
    element; otherwise return the list unchanged.
    """
    if len(givenList) == 1:
        output = givenList[0]
    else:
        output = givenList
    return output
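# Illustrative example (not part of the original gist):
# >>> unlistIfSingle(['steel'])
# 'steel'
# >>> unlistIfSingle(['steel', 'brass'])
# ['steel', 'brass']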
def parseTable(table):
    """
    Parse the object page's specification table into a dictionary.
    """
    if table is None:
        return dict()
    tableDict = {}
    tableRows = table('tr')
    for row in tableRows:
        rowItems = row('td')
        key, value = (rowItems[0].string, list(rowItems[1].stripped_strings))
        value = unlistIfSingle(value)
        tableDict[key] = value
    return tableDict
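# Illustrative check (hand-made HTML, not from the Te Papa site): each table
# row is expected to hold a key cell followed by a value cell.
# >>> parseTable(BeautifulSoup(
# ...     '<table><tr><td>Materials</td><td>steel</td></tr></table>',
# ...     'html.parser').table)
# {'Materials': 'steel'}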
def parseRelatedInfo(relatedInfoDiv):
    """
    Parse the object page's related-info section into a dictionary.
    """
    if relatedInfoDiv is None:
        return dict()
    relatedInfoDict = {}
    relatedInfo = relatedInfoDiv.find('div', class_="webpart").find_all(['strong', 'a'])
    #related info should consist of 'strong' tags followed by one or more 'a' tags
    for element in relatedInfo:
        elementVal = element.string
        if element.name == 'strong':
            #current element is a param name
            currentParam = elementVal
            relatedInfoDict[currentParam] = []
        else:
            #current element is a param value
            relatedInfoDict[currentParam].append(elementVal)
    #final processing - unwrap values that are the only element of their list
    #(don't return single-element lists)
    relatedInfoDict = {key: unlistIfSingle(val) for key, val in relatedInfoDict.items()}
    return relatedInfoDict
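# Illustrative check (hand-made HTML, not from the Te Papa site): each 'strong'
# tag names a field and the following 'a' tags hold its values.
# >>> html = ('<div class="related-info"><div class="webpart">'
# ...         '<strong>Made by:</strong><a>Hill Brothers</a></div></div>')
# >>> parseRelatedInfo(BeautifulSoup(html, 'html.parser').div)
# {'Made by:': 'Hill Brothers'}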
def parseObjectPage(htmlText):
    """
    Extract object info from given html, returning a dictionary of parsed info.
    """
    soup = BeautifulSoup(htmlText, 'html.parser')
    #Two main fields to grab: the specifications table, and the related info
    specTable = soup.find('table', class_='specifications')
    specDict = parseTable(specTable)
    relatedInfoDiv = soup.find('div', class_='related-info')
    relatedInfoDict = parseRelatedInfo(relatedInfoDiv)
    #merge parsed results into a single dict
    allInfoDict = specDict.copy()
    allInfoDict.update(relatedInfoDict)
    return allInfoDict
def getObjectInfo(objectIndex):
    """
    Get and parse data for the object with the given index.
    E.g.:
    >>> getObjectInfo(46209)
    {'Belonged to:': 'Seddon, Richard',
     'Classification': 'swords',
     'Credit line': 'Gift of Dame Elizabeth Knox Gilmer, 1955',
     'Dimensions': ['Overall: \r\n945mm (Length) \r\n x 110mm (Width) \r\n x 60mm (Depth)',
      'Overall: \r\n830mm (Length) \r\n x 25mm (Width) \r\n x 20mm (Depth)',
      'Overall: \r\n1010mm (Length) \r\n x 170mm (Width) \r\n x 15mm (Depth)',
      'Overall: \r\n950mm (Length) \r\n x 180mm (Width) \r\n x 35mm (Depth)',
      'Overall: \r\n990mm (Length) \r\n x 110mm (Width) \r\n x 60mm (Depth)'],
     'Made by:': 'Hill Brothers',
     'Made in:': 'England (United Kingdom)',
     'Made of:': 'steel',
     'Materials': 'steel',
     'Medium summary': 'Blade is steel, hilt is gilt brass.',
     'Part of:': 'History collection',
     'Production': 'Hill Brothers (manufacturer(s)), 1897, England',
     'Refers to:': 'Hill Brothers',
     'Registration number': 'PC000760',
     'Title': 'Ceremonial sword ("Court Sword").',
     'Type of:': 'swords',
     'objectIndex': 46209,
     'objectUrl': 'http://collections.tepapa.govt.nz/Object/46209'}
    """
    #print(objectIndex)
    url = 'http://collections.tepapa.govt.nz/Object/' + str(objectIndex)
    success = False
    while not success:
        try:
            r = requests.get(url)
            success = True
        except requests.exceptions.RequestException:
            print('Request failed. Retrying...')
    pageResults = parseObjectPage(r.text)
    pageResults['objectIndex'] = objectIndex
    pageResults['objectUrl'] = url
    return pageResults
def runChunkedRequests():
    """
    Get info for objects in groups of 100, saving a .json output file for
    each group, for a total of 10,000 objects.
    """
    for i in range(100):
        iChunkResults = [getObjectInfo(j) for j in range(i*100, (i+1)*100)]
        outfileName = 'chunk' + str(i) + '.json'
        with open(outfileName, 'w') as outfile:
            json.dump(iChunkResults, outfile)
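# Minimal usage sketch (an assumption, not part of the original gist): run the
# chunked scrape, then recombine the chunk*.json files written to the current
# working directory into a single list of object dictionaries.
if __name__ == '__main__':
    runChunkedRequests()
    allObjects = []
    for i in range(100):
        with open('chunk' + str(i) + '.json') as infile:
            allObjects.extend(json.load(infile))
    print('Scraped info for ' + str(len(allObjects)) + ' objects')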