Created
August 26, 2012 00:26
-
-
Save mrdmnd/3472770 to your computer and use it in GitHub Desktop.
TCG Player deck scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2012 Matt Redmond
__author__ = "[email protected]" | |
import mechanize | |
from Queue import Queue, Empty | |
from threading import Thread | |
import re | |
# Work queue of deck-print URLs; filled by populate_queue, drained by reader.
queue = Queue()

# Splits a card list on <br> tags regardless of letter case.
break_regex = re.compile("<br>", re.IGNORECASE)

# Matches the "deck_id=NNNNNNN" query fragment of a deck link.
# NOTE: the match deliberately excludes the leading "?" because
# populate_queue appends the match after a URL that already ends in "?".
# The earlier pattern began with r"\\?", which is an *optional literal
# backslash* (not an escaped "?"); a matched backslash would have been
# spliced into the download URL, so that prefix is dropped here.
deckid_regex = re.compile(r"deck_id=[0-9]{7}")

# HTML markers that introduce the two card lists on a deck-print page.
md = "<b>Main Deck:</b><br>"
sb = "<b>Sideboard:</b><br>"
def reader():
    """Worker loop: fetch, parse and save decks until the URL queue is empty.

    Repeatedly takes a URL from the module-level ``queue``, downloads it with
    mechanize, converts the page with ``parse_content`` and writes the result
    to ``decklists/<last 7 characters of the URL>``.  The loop ends when
    ``queue.get_nowait()`` raises ``Empty``.

    The ``decklists`` directory is not created here; ``open`` will fail if it
    does not already exist.
    """
    try:
        while True:
            url = queue.get_nowait()
            # A parenthesised single-argument print produces the same output
            # as the print statement on Python 2 and is also valid Python 3.
            print('Getting url %s' % url)
            content = mechanize.urlopen(url).read()
            deck = parse_content(content)
            # The last seven characters of the URL name the output file.
            # The with-block closes the file even if the write raises.
            with open("decklists/" + url[-7:], 'w') as out_file:
                out_file.write(deck)
    except (Empty, KeyboardInterrupt):
        # The tuple is required: "except Empty, KeyboardInterrupt" catches
        # only Empty and binds the exception to the name KeyboardInterrupt.
        pass
def parse_content(content):
    """Turn a deck-print HTML page into a plain-text deck list.

    The text between the ``md`` marker and the ``sb`` marker is treated as
    the main deck, and the text after the ``sb`` marker as the sideboard.
    Each section is split on ``<br>`` tags; fragments that are empty or
    contain a "?" are dropped.

    Args:
        content: The page source as a string.

    Returns:
        A string of the form ``"Mainboard:\\n<card>\\n...Sideboard:\\n<card>\\n..."``
        with one card entry per line.  A section whose marker is missing
        from the page is returned empty.
    """
    # Get rid of troublesome whitespace.
    content = content.replace("\r\n", "").replace("\n", "").replace("\t", "")

    md_start = content.find(md)
    sb_start = content.find(sb)

    # str.find returns -1 for a missing marker; slicing with that value
    # would silently produce garbage, so handle each absence explicitly.
    if md_start == -1:
        maindeck_html = ""
    else:
        md_end = sb_start if sb_start != -1 else len(content)
        maindeck_html = content[md_start + len(md):md_end]

    if sb_start == -1:
        sideboard_html = ""
    else:
        sideboard_html = content[sb_start + len(sb):]

    md_cards = [entry for entry in break_regex.split(maindeck_html)
                if entry and "?" not in entry]
    # The fragment after the final <br> of the sideboard is discarded
    # (presumably trailing page markup -- TODO confirm against a live page).
    sb_cards = [entry for entry in break_regex.split(sideboard_html)[:-1]
                if entry and "?" not in entry]

    # Build return string: every line, headers included, ends with "\n".
    lines = ["Mainboard:"] + md_cards + ["Sideboard:"] + sb_cards
    return "\n".join(lines) + "\n"
def populate_queue(id_strings):
    """Queue one deck-print URL for each entry of *id_strings*.

    Every entry is appended verbatim as the query string of the
    deck_print.asp URL, and the resulting URL is put on the module-level
    ``queue`` for the worker threads to fetch.
    """
    base_url = "http://magic.tcgplayer.com/db/deck_print.asp?"
    for deck_query in id_strings:
        queue.put(base_url + deck_query)
def get_search_results(game_format):
    """Run a TCGPlayer deck search and return the deck id strings it lists.

    Args:
        game_format: Value to select in the search form's "Format" field.
            The call in this file uses "Type II - M13" (M13 standard).
            Other options are "ISD Block", "Extended", "Modern", "Legacy",
            "Vintage" and "Commander"; see the page source of
            http://magic.tcgplayer.com/db/deck_search.asp for more.

    Returns:
        The list of strings matched by ``deckid_regex`` in the part of the
        result page that precedes "LATEST COMMUNITY DECKS".
    """
    # The "mechanize" package simulates clicking through the deck search.
    browser = mechanize.Browser()
    browser.open("http://magic.tcgplayer.com/db/deck_search.asp")
    browser.select_form(nr=1)

    browser["Format"] = [game_format]
    # Constrain the search to decks that have Top-8'ed. This can be removed
    # easily.
    browser["Place"] = ["1st", "2nd", "3rd - 4th", "5th - 8th"]

    page = browser.submit().read()

    # The website places some random deck lists on the page after this
    # heading; keep only what comes before it.
    search_section = page.partition("LATEST COMMUNITY DECKS")[0]
    return deckid_regex.findall(search_section)
if __name__ == '__main__':
    # Look up the M13 standard decks and queue one download URL per deck.
    id_strings = get_search_results("Type II - M13")
    populate_queue(id_strings)

    # Use 8 downloading and parsing threads to speed things up.
    workers = [Thread(target=reader) for _ in range(8)]
    for worker in workers:
        worker.start()

    # Wait for every worker thread to finish before exiting.
    for worker in workers:
        worker.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment