Scrapes hearthpwn.com for decklist data
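The script walks the first PAGES_TO_SCRAPE deck-listing pages, follows each deck link it finds, downloads that deck's plain-text export, and writes one deck-index,card,count row per card to deckdata.csv.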
import urllib2
from bs4 import BeautifulSoup

PAGES_TO_SCRAPE = 10

# Get HTML for all deck listing pages
htmls = []
for x in range(PAGES_TO_SCRAPE):
    print "SCRAPING PAGE " + str(x + 1)
    response = urllib2.urlopen('http://hearthpwn.com/decks?filter-deck-tag=3&page=' + str(x + 1))
    html = response.read()
    htmls += [html]
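# Note: there is no error handling or rate limiting here; a single failed
# request (urllib2.HTTPError or urllib2.URLError) will abort the whole run.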
# Parse all pages for links to each deck
links = []
for html in htmls:
    soup = BeautifulSoup(html, 'html.parser')
    deck_table = soup.find(attrs={'class': 'listing-decks'})
    rows = deck_table.tbody.find_all('tr')
    for row in rows:
        data = row.find_all('td')
        # The first cell holds the deck name; keep only anchors that point at a deck page
        for link in data[0].find_all('a'):
            if '/decks/' in link['href']:
                links += [link['href']]
# Grab the decklists from each deck link
decklists = []
for link in links:
    # Links look like '/decks/<id>-<slug>'; pull out the numeric id
    endpoint = link.split('/')[2]
    deck_id = endpoint.split('-')[0]
    print deck_id
    response = urllib2.urlopen('http://hearthpwn.com/decks/' + deck_id + '/export/1')
    deck_list = response.read()
    decklists += [deck_list]
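# Judging by the export endpoint and the parser below, each export is plain
# text with one card per line in the form "<count> <card name>", e.g.
# (illustrative lines, not captured output):
#   2 Fireball
#   1 Pyroblast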
# Parse each decklist into python datastructures
def parse_cockatrice_export(decklist):
    deck = {}
    for line in decklist.split('\n'):
        line = line.strip()  # drop the trailing '\r' left by CRLF line endings
        # Card lines start with a copy count, e.g. "2 Fireball"
        if line and line[0].isdigit():
            count, card = line.split(' ', 1)
            deck[card] = count
    return deck
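# Example with a hypothetical two-line export:
#   parse_cockatrice_export("2 Fireball\r\n1 Pyroblast\r\n")
# returns {'Fireball': '2', 'Pyroblast': '1'} (counts stay as strings).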
parsed_decks = map(parse_cockatrice_export, decklists)

# Write one row per (deck, card) pair: deck index, card name, copy count
with open('deckdata.csv', 'w') as f:
    for i, deck in enumerate(parsed_decks):
        for card, number in deck.iteritems():
            f.write(','.join([str(i), card, number]) + '\n')
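A minimal sketch of reading the output back, assuming card names contain no commas (the writer above does no CSV quoting):

import csv
decks = {}
with open('deckdata.csv') as f:
    for deck_id, card, count in csv.reader(f):
        decks.setdefault(deck_id, {})[card] = int(count)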