Created
May 7, 2016 15:39
-
-
Save meew0/04b6ecc20169d1244b96421f98fb6c10 to your computer and use it in GitHub Desktop.
Ruby script to parse a dump of Bulbapedia's Pokémon pages into obtainability data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script parses a dump of Bulbapedia's Pokémon pages into a JSON file
# with details about which Pokémon are obtainable in each region
# (specifically, in the latest series of games set in that region).
require 'nokogiri' | |
require 'json' | |
# An XML dump of all of Bulbapedia's Pokémon pages is required to exist at
# this path. It can be generated using this special page:
# http://bulbapedia.bulbagarden.net/wiki/Special:Export
# The file is about 17 MB as of Gen 6 (721 pages).
PATH = 'Bulbapedia-20160506234550.xml'

# Path where the output file will end up
RESULT_PATH = 'obtainability.json'

# Parse the dump inside a File.open block so the file handle is closed as soon
# as parsing has finished, instead of staying open for the rest of the run.
doc = File.open(PATH) do |file|
  Nokogiri::XML(file) do |config|
    # The xml file generated is valid so we can use strict mode to check for
    # corruption etc. Enabling nonet just in case
    config.strict.nonet
  end
end

# Get all page objects
pages = doc.css('mediawiki page')
puts "#{pages.length} pages found (should be 721)."
# Remove the " (Pokémon)" disambiguation suffix from a page title.
#
# Only a suffix at the very end of the title is removed; the same text
# appearing anywhere else in the title is left untouched.
#
# @param title [String] the page title
# @return [String] a new string without the trailing " (Pokémon)"
def poke_for(title)
  title.sub(/ \(Pokémon\)\z/, '')
end
# Lookup table from game version name to the region that game is set in.
# The data is written region-first and then flattened into version => region
# pairs; the order of the pairs determines the order of REGIONS below.
RELEVANT_VERSIONS = [
  ['Kanto', ['FireRed', 'LeafGreen']],
  ['Sinnoh', ['Diamond', 'Pearl', 'Platinum']],
  ['Johto', ['HeartGold', 'SoulSilver']],
  ['Unova', ['Black 2', 'White 2']],
  ['Kalos', ['X', 'Y']],
  ['Hoenn', ['Omega Ruby', 'Alpha Sapphire']]
].flat_map { |region, versions| versions.map { |version| [version, region] } }.to_h.freeze

# Distinct region names, in order of first appearance in RELEVANT_VERSIONS
REGIONS = RELEVANT_VERSIONS.values.uniq.freeze
# Represents one instance of Bulbapedia's Availability template. It ignores
# data such as colour and focuses only on whether it is available, the versions
# in which it is (or isn't) available, and the specified area.
class AvailabilityTag
  # Regex for the key part of a version attribute ("v=", "v2="). Anchored with
  # \A so it can only match at the very start of an attribute.
  VERSION_REGEX = /\Av\d?=/

  # Key part of the attribute that holds the area text
  AREA_PREFIX = 'area='

  # Readers
  attr_reader :area, :versions

  # @param raw [String] the unparsed template line
  # @param available [Boolean] whether the tag marks the Pokémon as available
  # @param versions [Array<String>] version names listed in the tag
  # @param area [String, nil] raw text of the area attribute, if any
  def initialize(raw, available, versions, area = nil)
    @raw = raw
    @available = available
    @versions = versions
    @area = area
  end

  # @return [Boolean] the availability flag given to the constructor
  def available?
    @available
  end

  # The region this availability tag is in (if its versions are
  # FireRed/LeafGreen, this method will return 'Kanto').
  #
  # The region is looked up from the first version that appears in
  # RELEVANT_VERSIONS. A warning is still printed when the very first version
  # is not a relevant one, but the tag now resolves to the region of the first
  # relevant version instead of silently resolving to nil.
  #
  # @return [String, nil] the region name, or nil if no version is relevant
  def region
    relevant_version = @versions.find { |version| RELEVANT_VERSIONS.key?(version) }
    return nil unless relevant_version

    unless RELEVANT_VERSIONS.key?(@versions.first)
      puts "!!!WARNING!!! Version #{@versions.first} isn't relevant even though there are relevant ones in array #{@versions.inspect}."
    end

    RELEVANT_VERSIONS[relevant_version]
  end

  # Parses a raw template line into an instance of this class.
  #
  # @param raw [String] a line such as "{{Availability/Entry2|v=X|v2=Y|area=...}}\n"
  # @return [AvailabilityTag]
  def self.parse(raw)
    # Strip surrounding whitespace (including any line ending) first, then the
    # outer two brackets on each side. Checking for the brackets explicitly
    # keeps the last characters of the tag intact when the line has no
    # trailing newline or ends in "\r\n".
    tag = raw.strip
    tag = tag[2..-1] if tag.start_with?('{{')
    tag = tag[0...-2] if tag.end_with?('}}')

    # Split the template into attributes
    attributes = tag.split('|')

    # Get an array of versions this template corresponds to: everything after
    # the "v=" / "vN=" key of each version attribute
    versions = attributes.grep(VERSION_REGEX).map { |attribute| attribute.sub(VERSION_REGEX, '') }

    # Check availability - availability tags for unavailable Pokémon are
    # suffixed with "/None"
    available = !tag.include?('/None|')

    # Everything from the first area attribute onwards is joined back together
    # so that pipes inside sub-tags of the area text do not cut it off
    area_index = attributes.index { |attribute| attribute.start_with?(AREA_PREFIX) }
    area = attributes[area_index..-1].join('|')[AREA_PREFIX.length..-1] if area_index

    new(raw, available, versions, area)
  end
end
# Corresponds to the entirety of availability tags on a particular page
class AvailabilityTagSet
  attr_reader :tags

  # @param tags [Array] parsed tags; each must respond to region, available?
  #   and area
  def initialize(tags)
    @tags = tags
  end

  # Returns a hash of region => { available:, area: } with one entry for every
  # region listed in REGIONS.
  #
  # For each region the first available tag wins. If the region has tags but
  # none is available, the first tag's area is reported (will be something
  # like "Event" or "Trade") with available set to false. If the region has no
  # tag at all, the area is nil.
  def regions
    # Resolve every tag's region exactly once up front. This avoids calling
    # tag.region repeatedly for each region being checked, which would redo
    # the lookup and repeat anything it prints.
    tags_by_region = @tags.group_by(&:region)

    REGIONS.each_with_object({}) do |region, result|
      regional_tags = tags_by_region.fetch(region, [])
      available_tag = regional_tags.find(&:available?)
      # Fall back to any tag for this region when none is available
      chosen_tag = available_tag || regional_tags.first

      result[region] = {
        available: !available_tag.nil?,
        area: chosen_tag ? chosen_tag.area : nil
      }
    end
  end

  # Parse an array of availability tag lines into an instance of this class
  def self.parse(tags)
    new(tags.map { |line| AvailabilityTag.parse(line) })
  end
end
# Hash in which the resulting data will be stored
result_file = {}

pages.each do |page|
  # Get the title and the Pokémon it represents
  title = page.at_css('title').text
  poke = poke_for(title)
  puts "Parsing page #{title}, Pokémon is #{poke}"

  # Get the page's content
  content = page.at_css('text').text

  # Then, get a list of lines from the content and filter out the ones that
  # contain availability tags. The inner block uses its own parameter name so
  # it does not shadow the page being processed.
  avail_tags = content.lines.select { |line| line.start_with?('{{Availability/Entry') }

  # Parse a tag set for the list of lines we retrieved earlier
  tag_set = AvailabilityTagSet.parse(avail_tags)

  # Set this Pokémon's region data to the tag set's
  result_file[poke] = tag_set.regions
end

# Write to file. We're done
bytes = File.write(RESULT_PATH, result_file.to_json)
puts "Wrote availabilities to file '#{RESULT_PATH}' (#{bytes} bytes)"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Amazing! May I ask where I can download the Bulbapedia dump?