mrdmnd · August 26, 2012 00:26
diff --git a/decklist_dl.py b/decklist_dl.py
 # Copyright 2012 Matt Redmond
 __author__ = "[email protected]"

 import mechanize
 from Queue import Queue, Empty
 from threading import Thread
 import re

 # Work queue of deck print-page URLs, filled by populate_queue() and drained
 # by the reader() worker threads.
 queue = Queue()
 # Splits deck HTML into fragments on <br> tags of any letter case.
 break_regex = re.compile("<br>", re.IGNORECASE)
 # Matches a "deck_id=NNNNNNN" query fragment (exactly seven digits).
 # No leading r"\\?" here: in a raw string that is an *optional literal
 # backslash*, not an escaped "?", so a stray backslash in front of "deck_id"
 # would have been captured and leaked into the URL built from the match.
 deckid_regex = re.compile(r"deck_id=[0-9]{7}")
 # Literal HTML markers that open the main deck and sideboard sections.
 md = "<b>Main Deck:</b><br>"
 sb = "<b>Sideboard:</b><br>"

 def reader():
   """Drain the shared URL queue, saving one parsed decklist per URL.

   Every URL taken from ``queue`` is downloaded with mechanize, converted
   by parse_content(), and written to ``decklists/`` under a file named
   after the last seven characters of the URL. The function returns once
   the queue is empty.
   """
   try:
     while True:
       url = queue.get_nowait()
       print('Getting url %s' % url)
       content = mechanize.urlopen(url).read()
       deck = parse_content(content)
       # The with-block closes the file even if the write raises.
       with open("decklists/" + url[-7:], 'w') as out_file:
         out_file.write(deck)
   except (Empty, KeyboardInterrupt):
     # The tuple matters: "except Empty, KeyboardInterrupt" is the
     # Python 2 spelling of "except Empty as KeyboardInterrupt", which
     # only catches Empty and rebinds the name KeyboardInterrupt.
     pass

 def parse_content(content):
   """Turn the HTML of a deck print page into a plain-text decklist.

   Args:
     content: the page HTML as a string.

   Returns:
     A string made of a "Mainboard:" line, one line per main deck entry,
     a "Sideboard:" line, and one line per sideboard entry; every line
     ends with a newline. A section whose marker (``md`` / ``sb``) is
     not present in the page is left empty.
   """
   # Get rid of troublesome whitespace.
   content = content.replace("\r\n", "").replace("\n", "").replace("\t", "")
   md_at = content.find(md)
   sb_at = content.find(sb)
   # str.find() returns -1 for a missing marker. Slicing with that value
   # would silently pick up unrelated parts of the page, so each case is
   # handled explicitly.
   if md_at == -1:
     maindeck_html = ""
   elif sb_at == -1:
     # No sideboard marker: the main deck section runs to the end.
     maindeck_html = content[md_at + len(md):]
   else:
     maindeck_html = content[md_at + len(md):sb_at]
   sideboard_html = "" if sb_at == -1 else content[sb_at + len(sb):]
   # Keep non-empty fragments that do not contain "?". The fragment after
   # the final <br> of the sideboard section is always discarded.
   md_cards = [c for c in break_regex.split(maindeck_html)
               if c and "?" not in c]
   sb_cards = [c for c in break_regex.split(sideboard_html)[:-1]
               if c and "?" not in c]
   # Build return string
   lines = ["Mainboard:"] + md_cards + ["Sideboard:"] + sb_cards
   return "\n".join(lines) + "\n"
 def populate_queue(id_strings):
   """Queue one deck print-page URL for each entry of id_strings.

   Each entry is appended verbatim to the print-page address, so it is
   expected to be a ready-made query fragment.
   """
   print_page = "http://magic.tcgplayer.com/db/deck_print.asp?"
   for deck_query in id_strings:
     queue.put(print_page + deck_query)
 def get_search_results(game_format):
   """Run a TCGPlayer deck search and return the deck id strings found.

   Uses the "mechanize" package to fill in and submit the search form
   (the one selected with nr=1) on the deck search page.

   Args:
     game_format: value for the form's "Format" field, e.g.
       "Type II - M13". Other options include "ISD Block", "Extended",
       "Modern", "Legacy", "Vintage" and "Commander"; view the source of
       http://magic.tcgplayer.com/db/deck_search.asp for the rest.

   Returns:
     Every deckid_regex match in the part of the response that comes
     before the "LATEST COMMUNITY DECKS" text.
   """
   browser = mechanize.Browser()
   browser.open("http://magic.tcgplayer.com/db/deck_search.asp")
   browser.select_form(nr=1)
   browser["Format"] = [game_format]
   # Only decks that made a Top 8; drop this line to search all finishes.
   browser["Place"] = ["1st", "2nd", "3rd - 4th", "5th - 8th"]
   results_page = browser.submit().read()
   # The site appends some random deck lists after this heading; keep only
   # what comes before it.
   wanted_html, _, _ = results_page.partition("LATEST COMMUNITY DECKS")
   return deckid_regex.findall(wanted_html)
 if __name__ == '__main__':
   import os

   # reader() writes its output into decklists/. Create the directory up
   # front so the workers do not fail on their first open() when it is
   # missing.
   if not os.path.isdir("decklists"):
     os.makedirs("decklists")
   id_strings = get_search_results("Type II - M13")
   populate_queue(id_strings)
   # Use 8 downloading and parsing threads to speed things up.
   workers = [Thread(target=reader) for _ in range(8)]
   for worker in workers:
     worker.start()
   # Wait until every worker has drained the queue.
   for worker in workers:
     worker.join()
	# Copyright 2012 Matt Redmond
	__author__ = "[email protected]"

	import mechanize
	from Queue import Queue, Empty
	from threading import Thread
	import re

	# Work queue of deck print-page URLs, filled by populate_queue() and drained
	# by the reader() worker threads.
	queue = Queue()
	# Splits deck HTML into fragments on <br> tags of any letter case.
	break_regex = re.compile("<br>", re.IGNORECASE)
	# Matches a "deck_id=NNNNNNN" query fragment (exactly seven digits).
	# No leading r"\\?" here: in a raw string that is an *optional literal
	# backslash*, not an escaped "?", so a stray backslash in front of "deck_id"
	# would have been captured and leaked into the URL built from the match.
	deckid_regex = re.compile(r"deck_id=[0-9]{7}")
	# Literal HTML markers that open the main deck and sideboard sections.
	md = "<b>Main Deck:</b><br>"
	sb = "<b>Sideboard:</b><br>"

	def reader():
		"""Drain the shared URL queue, saving one parsed decklist per URL.

		Every URL taken from ``queue`` is downloaded with mechanize, converted
		by parse_content(), and written to ``decklists/`` under a file named
		after the last seven characters of the URL. The function returns once
		the queue is empty.
		"""
		try:
			while True:
				url = queue.get_nowait()
				print('Getting url %s' % url)
				content = mechanize.urlopen(url).read()
				deck = parse_content(content)
				# The with-block closes the file even if the write raises.
				with open("decklists/" + url[-7:], 'w') as out_file:
					out_file.write(deck)
		except (Empty, KeyboardInterrupt):
			# The tuple matters: "except Empty, KeyboardInterrupt" is the
			# Python 2 spelling of "except Empty as KeyboardInterrupt", which
			# only catches Empty and rebinds the name KeyboardInterrupt.
			pass

	def parse_content(content):
		"""Turn the HTML of a deck print page into a plain-text decklist.

		Args:
			content: the page HTML as a string.

		Returns:
			A string made of a "Mainboard:" line, one line per main deck entry,
			a "Sideboard:" line, and one line per sideboard entry; every line
			ends with a newline. A section whose marker (``md`` / ``sb``) is
			not present in the page is left empty.
		"""
		# Get rid of troublesome whitespace.
		content = content.replace("\r\n", "").replace("\n", "").replace("\t", "")
		md_at = content.find(md)
		sb_at = content.find(sb)
		# str.find() returns -1 for a missing marker. Slicing with that value
		# would silently pick up unrelated parts of the page, so each case is
		# handled explicitly.
		if md_at == -1:
			maindeck_html = ""
		elif sb_at == -1:
			# No sideboard marker: the main deck section runs to the end.
			maindeck_html = content[md_at + len(md):]
		else:
			maindeck_html = content[md_at + len(md):sb_at]
		sideboard_html = "" if sb_at == -1 else content[sb_at + len(sb):]
		# Keep non-empty fragments that do not contain "?". The fragment after
		# the final <br> of the sideboard section is always discarded.
		md_cards = [c for c in break_regex.split(maindeck_html) if c and "?" not in c]
		sb_cards = [c for c in break_regex.split(sideboard_html)[:-1] if c and "?" not in c]
		# Build return string
		lines = ["Mainboard:"] + md_cards + ["Sideboard:"] + sb_cards
		return "\n".join(lines) + "\n"
	def populate_queue(id_strings):
		"""Queue one deck print-page URL for each entry of id_strings.

		Each entry is appended verbatim to the print-page address, so it is
		expected to be a ready-made query fragment.
		"""
		# Fill worker thread queue with URLs to get.
		for line in id_strings:
			queue.put("http://magic.tcgplayer.com/db/deck_print.asp?" + line)
	def get_search_results(game_format):
		"""Run a TCGPlayer deck search and return the deck id strings found.

		Uses the "mechanize" package to fill in and submit the search form
		(the one selected with nr=1) on the deck search page.

		Args:
			game_format: value for the form's "Format" field, e.g.
				"Type II - M13". Other options include "ISD Block", "Extended",
				"Modern", "Legacy", "Vintage" and "Commander"; view the source of
				http://magic.tcgplayer.com/db/deck_search.asp for the rest.

		Returns:
			Every deckid_regex match in the part of the response that comes
			before the "LATEST COMMUNITY DECKS" text.
		"""
		br = mechanize.Browser()
		br.open("http://magic.tcgplayer.com/db/deck_search.asp")
		br.select_form(nr=1)
		br["Format"] = [game_format]
		# Constrain the search to decks that have Top-8'ed. This can be removed easily.
		br["Place"] = ["1st", "2nd", "3rd - 4th", "5th - 8th"]
		resp = br.submit()
		stuff = resp.read()
		# The website places some random deck lists on the page, we want to strip those.
		stuff = stuff.partition("LATEST COMMUNITY DECKS")[0]
		id_strings = deckid_regex.findall(stuff)
		return id_strings
	if __name__ == '__main__':
		import os

		# reader() writes its output into decklists/. Create the directory up
		# front so the workers do not fail on their first open() when it is
		# missing.
		if not os.path.isdir("decklists"):
			os.makedirs("decklists")
		id_strings = get_search_results("Type II - M13")
		populate_queue(id_strings)
		workers = []
		# Use 8 downloading and parsing threads to speed things up.
		for _ in range(8):
			worker = Thread(target=reader)
			worker.start()
			workers.append(worker)
		# Wait until every worker has drained the queue.
		for worker in workers:
			worker.join()