Created
September 30, 2014 04:51
-
-
Save blinks/9d783f0399617523a851 to your computer and use it in GitHub Desktop.
gather.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Gather cards from wizards.com into local Elasticsearch.
# Adam Blinkinsop <http://plus.google.com/+AdamBlinkinsop>
from elasticsearch import Elasticsearch | |
import bs4 | |
import re | |
import urllib | |
def main(args):
    """Index every card of the requested sets into a local Elasticsearch.

    Args:
        args: Parsed command-line namespace. Only ``args.sets`` is read: a
            list of set names. When it is empty, the set names are scraped
            from the set drop-down on the Gatherer home page instead.

    Every card produced by ``cardsOf`` is sent to the ``cauldron`` index
    with doc type ``card``, using the card's ``cardTitle`` as the document
    id (so two cards with the same title share one id).
    """
    sets = args.sets
    if not sets:
        filename, _ = urllib.urlretrieve('http://gatherer.wizards.com/')
        try:
            # BeautifulSoup reads the whole file in its constructor, so the
            # handle can be closed as soon as parsing returns.
            with open(filename) as page:
                soup = bs4.BeautifulSoup(page)
        finally:
            # Remove the temporary file that urlretrieve() left behind.
            urllib.urlcleanup()
        # Skip options with a missing or empty value attribute.
        sets = [t['value'] for t in soup.select(
            'select#ctl00_ctl00_MainContent_Content_SearchControls_setAddText'
            ' > option') if t.get('value')]
    es = Elasticsearch()
    for name in sets:
        for card in cardsOf(name):
            es.index(index='cauldron', doc_type='card',
                     id=card['cardTitle'], body=card)
def soupOf(name, output='standard', page=0):
    """Fetch one page of Gatherer search results for a set and parse it.

    Args:
        name: Set name; sent as the ``set`` query parameter in the form
            ``["<name>"]``.
        output: Value of the ``output`` query parameter.
        page: Value of the ``page`` query parameter (``cardsOf`` starts
            counting at 0).

    Returns:
        A ``bs4.BeautifulSoup`` tree of the downloaded results page.
    """
    set_filter = '["%s"]' % name
    if isinstance(set_filter, unicode):
        # Python 2's urlencode() calls str() on each value, which raises
        # UnicodeEncodeError for non-ASCII unicode; encode explicitly.
        set_filter = set_filter.encode('utf-8')
    query = urllib.urlencode({
        'output': output, 'page': page,
        'set': set_filter,
    })
    url = 'http://gatherer.wizards.com/Pages/Search/Default.aspx?' + query
    filename, _ = urllib.urlretrieve(url)
    try:
        # The parser consumes the file inside the constructor, so closing
        # the handle on exit from the with-block is safe.
        with open(filename) as results:
            return bs4.BeautifulSoup(results)
    finally:
        # Remove the temporary file that urlretrieve() left behind.
        urllib.urlcleanup()
def cardsOf(name):
    """Yield one dict per card of a set, walking the result pages in order.

    Args:
        name: Set name, passed through to ``soupOf``.

    Yields:
        A dict with the keys ``cardTitle``, ``manaCost``,
        ``convertedManaCost``, ``typeLine``, ``rulesText`` and
        ``setVersions``, each holding the flattened text of the matching
        element inside one ``tr.cardItem`` row.

    Iteration ends when the page has no ``div.pagingcontrols``, when that
    div contains no text ending in ``>``, or when that text is not inside
    an ``<a>`` element.
    """
    page = 0
    while True:
        # Progress output; the single pre-formatted string prints the same
        # "<name> <page>" text as the two-argument print statement.
        print('%s %s' % (name, page))
        soup = soupOf(name, page=page)
        for item in soup.find_all('tr', class_='cardItem'):
            yield {
                'cardTitle': oracleOf(item, 'span.cardTitle'),
                'manaCost': oracleOf(item, 'span.manaCost'),
                'convertedManaCost': oracleOf(item, 'span.convertedManaCost'),
                'typeLine': oracleOf(item, 'span.typeLine'),
                'rulesText': oracleOf(item, 'div.rulesText'),
                'setVersions': oracleOf(item, 'td.setVersions'),
            }
        paging = soup.find('div', class_='pagingcontrols')
        if paging is None:
            # No paging controls at all: treat this as the only page rather
            # than failing with AttributeError on None.
            return
        # NOTE(review): presumably the "next page" control is the text ending
        # in '>' and is only a link while further pages exist -- confirm
        # against the live markup.
        n = paging.find(text=lambda s: s.endswith('>'))
        if n is None or n.find_parent('a') is None:
            return
        page += 1
def oracleOf(soup, selector):
    """Return the flattened text of the first element matching a selector.

    Args:
        soup: A parsed element to search within.
        selector: CSS selector handed to ``soup.select``.

    Returns:
        The text produced by ``oracleOfHtml`` for the first match, with each
        run of consecutive spaces collapsed to a single space.

    Raises:
        IndexError: If the selector matches nothing.
    """
    matches = soup.select(selector)
    first = matches[0]
    flattened = oracleOfHtml(first)
    return re.sub(r' +', ' ', flattened)
def oracleOfHtml(soup):
    """Recursively flatten a parsed node into plain text.

    A ``bs4.NavigableString`` yields its stripped text, an ``<img>`` tag
    yields its ``alt`` attribute wrapped in braces, and any other tag yields
    the flattened text of its children joined by single spaces and stripped.
    """
    # Leaf case: a bare string in the parse tree.
    if isinstance(soup, bs4.NavigableString):
        text = unicode(soup)
        return text.strip()

    # Images are represented by their alt text as a {token}.
    if soup.name == 'img':
        return u'{%s}' % soup['alt']

    # Any other tag: flatten every child, then join and trim.
    pieces = [oracleOfHtml(child) for child in soup.children]
    return ' '.join(pieces).strip()
if __name__ == '__main__':
    import argparse

    # Positional arguments are the set names to gather; when none are given,
    # main() scrapes the list of sets itself.
    cli = argparse.ArgumentParser(
        description='Gather cards from wizards.com into Elasticsearch')
    cli.add_argument(
        'sets', nargs='*', type=unicode, metavar='N', help='a set to gather')
    main(cli.parse_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
TODO