Last active
September 5, 2018 10:16
-
-
Save seaneshbaugh/1310839 to your computer and use it in GitHub Desktop.
MTG scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'active_support/core_ext/string' | |
| require 'mechanize' | |
| require 'optparse' | |
| require 'ostruct' | |
| require 'pry' | |
| require 'tempfile' | |
# A named Magic: The Gathering set together with the cards scraped for it.
#
# name  - the set name as listed on Gatherer (String, nil until assigned).
# cards - the Card objects collected for this set (Array, starts empty).
class CardSet
  attr_accessor :name, :cards

  # Creates a set with no name and an empty card list.
  def initialize
    @cards = []
  end
end
# Plain value holder for one scraped card. Every attribute is a simple
# read/write accessor and is nil until assigned.
class Card
  attr_accessor :multiverse_id,
                :name,
                :mana_cost,
                :converted_mana_cost,
                :card_type,
                :card_text,
                :flavor_text,
                :power,
                :toughness,
                :loyalty,
                :rarity,
                :card_number,
                :artist
end
# Command-line scraper for gatherer.wizards.com.
#
# Typical flow (see the script at the bottom of the file): parse the command
# line, build the list of sets, write "sets.sql", scrape every card of every
# set, then write one SQL file per set.
#
# The SQL is produced by string interpolation; values are trimmed and single
# quotes are doubled, nothing else is escaped.
class Gatherer
  attr_accessor :sets, :agent, :options

  # Help text printed for --help and after a command-line parsing error.
  Usage = [
    'Usage:',
    '  gatherer.rb [options]',
    '',
    'Options:',
    '  -f, [--force]      # Ignore file collisions',
    '  -s, [--sets=SETS]  # Comma delimited list of sets to retrieve',
    '  -S, [--skip]       # Skip file collisions',
    '  -p, [--pretend]    # Run but do not output any files',
    '  -q, [--quiet]      # Suppress status output',
    '  -V, [--verbose]    # Show extra output',
    '  -h, [--help]       # Show this help message and quit',
    '  -v, [--version]    # Show gatherer.rb version number and quit',
    '',
    'Description:',
    '  Scrapes gatherer.wizards.com for MTG sets and cards. Outputs the',
    '  results as SQL files; one for the list of sets and then one for',
    '  each set.',
    '',
    'Examples:',
    '  gatherer.rb',
    '  This retrieves all MTG sets.',
    '',
    '  gatherer.rb -s "Alliances,Future Sight"',
    '  This retrieves the "Alliances" and "Future Sight" sets.'
  ].map { |line| "#{line}\n" }.join.freeze

  Version = '0.1.0'

  # Shared id prefix of the detail rows on a card page.
  DETAIL_ROW_PREFIX = '#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_'.freeze

  # Card attributes in the column order of the generated INSERT statement.
  CARD_COLUMNS = [
    :multiverse_id, :name, :mana_cost, :converted_mana_cost, :card_type,
    :card_text, :flavor_text, :power, :toughness, :loyalty, :rarity,
    :card_number, :artist
  ].freeze

  # Key reference shown when the overwrite prompt gets an unknown answer.
  OVERWRITE_HELP = [
    'y - yes, overwrite',
    'n - no, do not overwrite',
    'a - all, overwrite this and all others',
    'q - quit, abort',
    'd - diff, show the differences between the old and the new',
    'h - help, show this help'
  ].freeze

  private_constant :DETAIL_ROW_PREFIX, :CARD_COLUMNS, :OVERWRITE_HELP

  # Sets every option to its default and creates the Mechanize agent.
  def initialize
    @options = OpenStruct.new
    @options.force = false
    @options.sets = []
    @options.skip = false
    @options.pretend = false
    @options.quiet = false
    @options.verbose = false
    @options.help = false
    @options.version = false
    @sets = []
    @agent = Mechanize.new
  end

  # Parses argv destructively into @options.
  #
  # --help and --version print their text and exit with status 0. A parsing
  # error prints the message followed by the usage text and exits with
  # status 1. Only OptionParser errors are rescued: rescuing Exception would
  # also swallow the SystemExit raised by the --help/--version handlers.
  #
  # Returns what OptionParser#parse! returns (the non-option arguments).
  def parse_options(argv)
    build_option_parser.parse!(argv)
  rescue OptionParser::ParseError => error
    puts "Error: #{error.message}\n\n"
    puts Usage
    exit 1
  end

  # Fills @sets with one CardSet per requested set name, or per set listed on
  # the Gatherer search page when no sets were requested. Names are trimmed
  # and empty names are dropped.
  def get_card_sets
    requested = Array(@options.sets)
    set_names = requested.empty? ? fetch_set_names : requested

    set_names.each do |raw_name|
      name = raw_name.to_s.strip
      next if name.empty?

      set = CardSet.new
      set.name = name
      @sets << set
    end
  end

  # Writes "sets.sql" with one INSERT per entry of @sets.
  # Returns true when the file was written, false otherwise.
  def output_card_sets_list_as_sql
    write_file('sets.sql') do |file|
      @sets.each do |set|
        file.puts "INSERT INTO CARD_SETS (name) VALUES ('#{sql_escape(set.name)}');"
      end
    end
  end

  # Writes one SQL file for every entry of @sets.
  def output_card_sets_as_sql
    @sets.each { |set| output_card_set_as_sql(set) }
  end

  # Writes "<parameterized_set_name>.sql" with one INSERT per card of set,
  # ordered by numeric multiverse id.
  # Returns true when the file was written, false otherwise.
  def output_card_set_as_sql(set)
    file_name = "#{set.name.gsub(/'/, '').parameterize.underscore}.sql"

    write_file(file_name) do |file|
      set.cards.sort_by { |card| card.multiverse_id.to_i }.each do |card|
        file.puts card_insert_statement(card)
      end
    end
  end

  # Scrapes the search result pages of every set in @sets and appends the
  # cards found to the set's card list.
  def get_cards
    @sets.each do |set|
      page_uri = "http://gatherer.wizards.com/Pages/Search/Default.aspx?sort=color+&set=[%22#{CGI::escape(set.name)}%22]"
      visited = []

      # Follow "next page" links until there is none or one leads back to a
      # page that was already fetched (guards against pagination cycles).
      until page_uri.empty? || visited.include?(page_uri)
        visited << page_uri
        set_page = agent.get(page_uri)
        set_page.parser.css('.cardItem').each do |card_container|
          collect_cards(set, card_container)
        end
        page_uri = next_page_uri(set_page)
      end
    end
  end

  # Fetches a card detail page and returns a Card built from it. Attributes
  # whose row is missing from the page are set to the empty string.
  def get_card(card_page_uri)
    doc = agent.get(card_page_uri).parser
    card = Card.new

    card.multiverse_id = extract_multiverse_id(doc)
    card.name = row_text(doc, 'nameRow')
    # The mana symbols are images; their alt texts are joined with ";".
    card.mana_cost = doc.css("#{DETAIL_ROW_PREFIX}manaRow .value img").map { |img| img['alt'].to_s }.join(';')
    card.converted_mana_cost = row_text(doc, 'cmcRow')
    card.card_type = row_text(doc, 'typeRow').delete("\n")
    card.card_text = extract_card_text(doc)
    card.flavor_text = extract_flavor_text(doc)
    assign_stats(card, doc)
    card.rarity = row_text(doc, 'rarityRow')
    card.card_number = row_text(doc, 'numberRow')
    card.artist = row_text(doc, 'artistRow')

    report_card(card, doc) if options.verbose
    card
  end

  private

  # Builds the OptionParser whose handlers store their values in @options.
  def build_option_parser
    OptionParser.new do |ops|
      ops.banner = Usage
      ops.separator('')
      ops.on('-f', '--force', 'Ignore file collisions') { |force| @options.force = force }
      ops.on('-s', '--sets SETS', 'Comma delimited list of sets to retrieve') do |sets|
        @options.sets = sets.split(',')
      end
      ops.on('-S', '--skip', 'Skip file collisions') { |skip| @options.skip = skip }
      ops.on('-p', '--pretend', 'Run but do not output any files') { |pretend| @options.pretend = pretend }
      ops.on('-q', '--quiet', 'Suppress status output') { |quiet| @options.quiet = quiet }
      ops.on('-V', '--verbose', 'Show extra output') { |verbose| @options.verbose = verbose }
      ops.on('-h', '--help', 'Show this help message and quit') do
        puts Usage
        exit
      end
      ops.on('-v', '--version', 'Show gatherer.rb version number and quit') do
        puts "gatherer.rb #{Version}"
        exit
      end
    end
  end

  # Reads the set names offered by the set drop-down of the search page.
  # Only <option> elements are considered, so stray text nodes inside the
  # <select> cannot cause a nil dereference.
  def fetch_set_names
    search_page = @agent.get('http://gatherer.wizards.com/Pages/Default.aspx')
    options_css = 'select#ctl00_ctl00_MainContent_Content_SearchControls_setAddText option'
    search_page.parser.css(options_css).map { |option| option['value'].to_s }
  end

  # Adds the cards of one search result entry to set: the entry's own card
  # plus every "other version" whose set icon alt text contains the set name.
  # NOTE(review): this is a substring test, so a set whose name is a prefix
  # of another set's name may also pick up that other set's printings.
  def collect_cards(set, card_container)
    card_container.css('.cardTitle a').each do |card_link|
      set.cards << get_card(absolute_card_uri(card_link))
    end

    card_container.css('.setVersions .otherSetSection a').each do |version_link|
      icon = version_link.css('img').first
      next if icon.nil?
      # Literal comparison: String#match would read the name as a regexp.
      next unless icon['alt'].to_s.include?(set.name)

      set.cards << get_card(absolute_card_uri(version_link))
    end
  end

  # Turns the relative href of a result link into an absolute URI by
  # dropping its first three characters (presumably a "../" prefix).
  def absolute_card_uri(link)
    "http://gatherer.wizards.com/Pages/#{link['href'].to_s[3..-1]}"
  end

  # Returns the absolute URI of the first pagination link whose label ends
  # with ">", or the empty string when there is no such link.
  def next_page_uri(set_page)
    links = set_page.parser.css('#ctl00_ctl00_ctl00_MainContent_SubContent_topPagingControlsContainer a')
    next_link = links.find do |link|
      label = link.children.first
      label && label.text.end_with?('>')
    end
    next_link ? "http://gatherer.wizards.com#{next_link['href']}" : ''
  end

  # Trimmed text of the ".value" cell of a detail row ('' when absent).
  def row_text(doc, row)
    doc.css("#{DETAIL_ROW_PREFIX}#{row} .value").text.strip
  end

  # The multiverse id is whatever follows the first "=" in the action
  # attribute of the page form ('' when the form or the "=" is missing).
  def extract_multiverse_id(doc)
    form = doc.css('#aspnetForm').first
    return '' if form.nil?

    action = form['action'].to_s
    separator = action.index('=')
    separator ? action[(separator + 1)..-1] : ''
  end

  # Re-parses the markup inside the ".value" cell of a detail row as its own
  # document. Returns nil when the row is absent.
  def value_fragment(doc, row)
    value = doc.css("#{DETAIL_ROW_PREFIX}#{row} .value")
    return nil if value.empty?

    Nokogiri::HTML(value.children.to_s.strip)
  end

  # Rules text as HTML: symbol images are pointed at /assets/symbols/<name>.png,
  # their align attribute is dropped, newlines are removed and <i> becomes <em>.
  def extract_card_text(doc)
    fragment = value_fragment(doc, 'textRow')
    return '' if fragment.nil?

    fragment.xpath('//img').each do |img|
      symbol = img['src'].to_s[/name=(.*)&/, 1]
      # Leave the src alone when it does not carry a symbol name.
      img['src'] = "/assets/symbols/#{symbol}.png" if symbol
      img.remove_attribute('align')
    end

    emphasize(fragment.xpath("//div[@class='cardtextbox']").inner_html.gsub(/\n/, ''))
  end

  # Flavor text as HTML with <i> converted to <em>.
  def extract_flavor_text(doc)
    fragment = value_fragment(doc, 'flavorRow')
    return '' if fragment.nil?

    emphasize(fragment.xpath("//div[@class='cardtextbox']").inner_html)
  end

  # Replaces <i>…</i> with <em>…</em>.
  def emphasize(html)
    html.gsub(/<i>/, '<em>').gsub(/<\/i>/, '</em>')
  end

  # Sets power/toughness or loyalty from the P/T row. The row is read as
  # loyalty for planeswalkers and as "power / toughness" otherwise; anything
  # that cannot be read stays the empty string.
  def assign_stats(card, doc)
    card.power = ''
    card.toughness = ''
    card.loyalty = ''

    label = doc.css("#{DETAIL_ROW_PREFIX}ptRow .label").text.strip
    value = doc.css("#{DETAIL_ROW_PREFIX}ptRow .value").text.strip

    if card.card_type =~ /Planeswalker/
      card.loyalty = value if label =~ /Loyalty:/
    elsif label =~ /P\/T:/
      parts = value.split('/')
      card.power = parts[0].to_s.strip
      card.toughness = parts[1].to_s.strip
    end
  end

  # Verbose progress line for a retrieved card. The set name is taken from
  # the second link of the set symbol block when there is one.
  def report_card(card, doc)
    set_links = doc.css("#{DETAIL_ROW_PREFIX}currentSetSymbol a")
    set_name = set_links.length > 1 ? set_links[1].text.strip : ''
    puts "Retrieved card #{card.multiverse_id} \"#{card.name}\" #{set_name.empty? ? '' : "(#{set_name})"}"
  end

  # Trims a value and doubles its single quotes for use inside a quoted SQL
  # literal; nil becomes the empty string.
  def sql_escape(value)
    value.to_s.gsub("'", "''").strip
  end

  # Builds the INSERT statement for one card.
  def card_insert_statement(card)
    values = CARD_COLUMNS.map { |column| "'#{sql_escape(card.public_send(column))}'" }
    "INSERT INTO CARDS (#{CARD_COLUMNS.join(', ')}) VALUES (#{values.join(', ')});"
  end

  # Prints a status message unless --quiet was given.
  def status(message)
    puts message unless @options.quiet
  end

  # Renders a file by yielding an IO to the block exactly once, then installs
  # the result at destination according to the collision options.
  #
  # The content is rendered into a temporary file that is removed again when
  # this method returns.
  #
  # Returns true when destination was written, false otherwise.
  def write_file(destination, &block)
    Tempfile.create(File.basename(destination)) do |rendered|
      block.call(rendered)
      rendered.flush
      install_rendered_file(rendered.path, destination)
    end
  end

  # Copies rendered_path to destination unless pretend mode, identical
  # content, --skip or the user's answer says otherwise.
  def install_rendered_file(rendered_path, destination)
    if @options.pretend
      status "#{destination} not written (pretend)."
      return false
    end

    if File.exist?(destination)
      if identical?(rendered_path, destination)
        status "#{destination} is identical, ignoring."
        return false
      end
      return false unless overwrite?(destination)
    end

    IO.copy_stream(rendered_path, destination)
    true
  end

  # Decides whether an existing, different destination may be replaced:
  # --skip wins over --force, and without either the user is asked.
  def overwrite?(destination)
    if @options.skip
      status "#{destination} exists, skipping."
      return false
    end
    return true if @options.force

    confirm_overwrite(destination)
  end

  # Asks on standard input whether destination may be overwritten and keeps
  # asking until a decisive answer arrives. Answering "a" also turns on
  # --force for the rest of the run; "q" exits the program. End of input is
  # treated as "no" so a non-interactive run cannot loop forever.
  def confirm_overwrite(destination)
    puts "Warning: \"#{destination}\" already exists. Force overwrite? (enter \"h\" for help) [ynaqdh]"

    loop do
      # $stdin rather than Kernel#gets, which would read files named in ARGV.
      answer = $stdin.gets
      if answer.nil?
        puts "Skipping #{destination}."
        return false
      end

      case answer.chomp
      when /\Ay\z/i
        puts "Overwriting #{destination}."
        return true
      when /\An\z/i
        puts "Skipping #{destination}."
        return false
      when /\Aa\z/i
        @options.force = true
        return true
      when /\Aq\z/i
        puts "Skipping #{destination} and exiting."
        exit
      when /\Ad\z/i
        puts 'diff not supported yet.'
      else
        puts OVERWRITE_HELP
      end
    end
  end

  # True when both paths hold the same content. A directory at destination
  # never counts as identical.
  def identical?(source, destination)
    return false if File.directory?(destination)

    File.read(source) == File.read(destination)
  end
end
# Script entry point: parse the command line, write the list of sets, scrape
# every set and write one SQL file per set.
gatherer = Gatherer.new
gatherer.parse_options(ARGV)
gatherer.get_card_sets
gatherer.output_card_sets_list_as_sql
gatherer.get_cards
gatherer.output_card_sets_as_sql

# Opening a pry session on every run blocks unattended use, so the debugging
# console is opt-in: set GATHERER_DEBUG to inspect `gatherer` after the run.
binding.pry if ENV['GATHERER_DEBUG']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment