Skip to content

Instantly share code, notes, and snippets.

@mrdmnd
Created August 26, 2012 00:26
Show Gist options
  • Save mrdmnd/3472770 to your computer and use it in GitHub Desktop.
Save mrdmnd/3472770 to your computer and use it in GitHub Desktop.
TCG Player deck scraper
# Copyright 2012 Matt Redmond
__author__ = "[email protected]"
import mechanize
from Queue import Queue, Empty
from threading import Thread
import re
# Work queue shared by the worker threads: populate_queue() puts deck URLs on
# it and reader() takes them off.
queue = Queue()
# Splits HTML on "<br>" tags, matching any letter case.
break_regex = re.compile("<br>", re.IGNORECASE)
# Matches "deck_id=" followed by seven digits, optionally preceded by a literal
# backslash (inside a raw string, "\\?" is an optional backslash, not an
# escaped question mark).
# NOTE(review): an escaped "?" may have been intended -- confirm before
# changing, because populate_queue() appends each match right after a "?".
deckid_regex = re.compile(r"\\?deck_id=[0-9]{7}")
# HTML markers that parse_content() searches for to locate the main-deck and
# sideboard sections of a deck page.
md = "<b>Main Deck:</b><br>"
sb = "<b>Sideboard:</b><br>"
def reader():
    """Worker loop: download and save deck lists until the queue is empty.

    Repeatedly takes a URL from the module-level ``queue`` without blocking,
    downloads it with ``mechanize.urlopen``, converts the page with
    ``parse_content`` and writes the result to ``decklists/<name>``, where
    ``<name>`` is the last seven characters of the URL.

    Returns normally when the queue is empty (``Empty``) or when a
    ``KeyboardInterrupt`` is raised inside the loop.  Any other exception
    propagates to the caller.
    """
    try:
        while True:
            url = queue.get_nowait()
            print('Getting url %s' % url)
            content = mechanize.urlopen(url).read()
            deck = parse_content(content)
            # ``with`` closes the file even if the write fails.
            with open("decklists/" + url[-7:], 'w') as out_file:
                out_file.write(deck)
    # The tuple is required: ``except Empty, KeyboardInterrupt`` would catch
    # only Empty and bind the exception object to the name KeyboardInterrupt.
    except (Empty, KeyboardInterrupt):
        pass
def parse_content(content):
    """Convert the HTML of a deck page into a plain-text deck list.

    The page is searched for the module-level markers ``md`` (main deck) and
    ``sb`` (sideboard).  The text after each marker is split on ``<br>`` tags
    (``break_regex``); fragments that are empty or contain "?" are dropped.

    Args:
        content: HTML of the deck page as a string.

    Returns:
        A string consisting of a "Mainboard:" line, one line per main-deck
        entry, a "Sideboard:" line and one line per sideboard entry, each
        terminated by a newline.  A section whose marker does not occur in
        ``content`` is emitted with no entries.
    """
    # Get rid of troublesome whitespace.
    content = content.replace("\r\n", "").replace("\n", "").replace("\t", "")

    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently select the wrong text, so each case is handled explicitly.
    md_start = content.find(md)
    sb_start = content.find(sb)

    # Parse out Maindeck from Sideboard
    if md_start == -1:
        md_fragments = []
    elif sb_start == -1:
        # No sideboard marker: the main deck runs to the end of the page, so
        # the fragment after the last <br> is dropped, as for the sideboard.
        # NOTE(review): assumes that trailing fragment is never a card.
        md_fragments = break_regex.split(content[md_start + len(md):])[:-1]
    else:
        md_fragments = break_regex.split(content[md_start + len(md):sb_start])

    if sb_start == -1:
        sb_fragments = []
    else:
        # The fragment after the final <br> is discarded.
        sb_fragments = break_regex.split(content[sb_start + len(sb):])[:-1]

    md_cards = [card for card in md_fragments if card and "?" not in card]
    sb_cards = [card for card in sb_fragments if card and "?" not in card]

    # Build return string
    lines = ["Mainboard:"] + md_cards + ["Sideboard:"] + sb_cards
    return "\n".join(lines) + "\n"
def populate_queue(id_strings):
    """Put one deck-print URL on the worker queue per entry of ``id_strings``.

    Each entry is appended verbatim to the deck_print.asp URL (after its
    "?") and the result is added to the module-level ``queue``.
    """
    base_url = "http://magic.tcgplayer.com/db/deck_print.asp?"
    for deck_query in id_strings:
        queue.put(base_url + deck_query)
def get_search_results(game_format):
    """Run a TCGPlayer deck search and return the deck id strings it lists.

    Uses the "mechanize" package to open the deck search page, select the
    form at index 1, fill it in and submit it.

    Args:
        game_format: Value for the form's "Format" field.  The script passes
            "Type II - M13"; other options named by the original author are
            "ISD Block", "Extended", "Modern", "Legacy", "Vintage" and
            "Commander".  See the page source of
            http://magic.tcgplayer.com/db/deck_search.asp for more.

    Returns:
        A list of every ``deckid_regex`` match found in the result page
        before the text "LATEST COMMUNITY DECKS".
    """
    browser = mechanize.Browser()
    browser.open("http://magic.tcgplayer.com/db/deck_search.asp")
    browser.select_form(nr=1)
    browser["Format"] = [game_format]
    # Only decks that made the Top 8; drop this line to lift the restriction.
    browser["Place"] = ["1st", "2nd", "3rd - 4th", "5th - 8th"]
    page = browser.submit().read()
    # The site appends some random deck lists after this heading; keep only
    # the part of the page that precedes it.
    search_part = page.split("LATEST COMMUNITY DECKS", 1)[0]
    return deckid_regex.findall(search_part)
if __name__ == '__main__':
    # Collect the deck ids for the chosen format and queue their URLs.
    deck_ids = get_search_results("Type II - M13")
    populate_queue(deck_ids)
    # Use 8 downloading and parsing threads to speed things up.
    workers = [Thread(target=reader) for _ in range(8)]
    for worker in workers:
        worker.start()
    # Wait until every worker has finished.
    for worker in workers:
        worker.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment