Created
August 31, 2013 20:46
-
-
Save danslimmon/6400533 to your computer and use it in GitHub Desktop.
Goes through the HTML files in the directory /Users/dan/j-archive/html, which have come from http://www.j-archive.com/showgame.php?game_id=<NUMBER> and are called <NUMBER>.html, and parses out the questions, answers, categories, etc. Generates SQL which can be loaded into a SQLite database.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
require 'rubygems' | |
require 'nokogiri' | |
class Clue | |
attr_accessor :text, :answer, :value, :category, :round, :difficulty, :shownumber | |
end | |
def escape(text) | |
t = text.clone | |
# Get rid of single-quotes already escaped for JS | |
t.gsub!(/\\'/, "'") | |
# Escape single-quotes | |
t.gsub!(/'/, "''") | |
t | |
end | |
def write_round(page, round_name) | |
title = page.css("title") | |
title.text =~ /Show #(\d+)/ | |
show_number = $1 | |
category_cells = page.css("div##{round_name} td.category_name") | |
clue_tables = page.css("div##{round_name} td.clue>table") | |
clue_tables.each do |clue_table| | |
c = Clue.new | |
# Populate the clue text | |
clue_text_cell = clue_table.css("td.clue_text").first | |
clue_id = clue_text_cell["id"] | |
c.text = clue_text_cell.text | |
# Populate the clue category | |
clue_id =~ /^clue_([FD]?J)_(\d)_(\d)/ | |
category_num = $2.to_i - 1 | |
c.category = category_cells[category_num].text | |
# Populate the clue round and difficulty | |
c.round = $1 | |
c.difficulty = $3 | |
# Populate the clue shownumber | |
c.shownumber = show_number | |
# Populate the clue value | |
values = clue_table.css("td.clue_value") | |
dd_values = clue_table.css("td.clue_value_daily_double") | |
if values.length > 0 | |
c.value = values.first.text | |
elsif dd_values.length > 0 | |
c.value = "$0" | |
else | |
next | |
end | |
# Populate the clue answer | |
shitty_div = clue_table.css("div").first | |
shitty_div["onmouseover"] =~ /correct_response">(.*?)<\// | |
answer = $1 | |
answer = answer.downcase | |
answer.sub!(/^<i>/, "") | |
answer.sub!(/^"/, "") | |
answer.sub!(/"$/, "") | |
c.answer = answer | |
puts "INSERT INTO clue (text, answer, value, category, round, difficulty, shownumber) VALUES | |
('%s', '%s', '%s', '%s', '%s', '%s', '%s');" % [ | |
escape(c.text), | |
escape(c.answer), | |
c.value, | |
escape(c.category), | |
c.round, | |
c.difficulty, | |
c.shownumber | |
] | |
end | |
end | |
def write_fj(page) | |
title = page.css("title") | |
title.text =~ /Show #(\d+)/ | |
show_number = $1 | |
category_cell = page.css("div#final_jeopardy_round td.category_name") | |
clue_table = page.css("div#final_jeopardy_round>table").first | |
return if clue_table.nil? | |
c = Clue.new | |
# Populate the clue text | |
clue_text_cell = clue_table.css("td.clue_text").first | |
clue_id = clue_text_cell["id"] | |
c.text = clue_text_cell.text | |
# Populate the clue category | |
c.category = clue_table.css("td.category_name").text | |
# Populate the clue round | |
c.round = "FJ" | |
c.difficulty = "0" | |
# Populate the clue shownumber | |
c.shownumber = show_number | |
# Populate the clue value | |
c.value = "$0" | |
# Populate the clue answer | |
shitty_div = clue_table.css("div").first | |
shitty_div["onmouseover"] =~ /correct_response\\">(.*?)<\// | |
answer = $1 | |
answer = answer.downcase | |
answer.sub!(/^<i>/, "") | |
answer.sub!(/^"/, "") | |
answer.sub!(/"$/, "") | |
c.answer = answer | |
puts "INSERT INTO clue (text, answer, value, category, round, difficulty, shownumber) VALUES | |
('%s', '%s', '%s', '%s', '%s', '%s', '%s');" % [ | |
escape(c.text), | |
escape(c.answer), | |
c.value, | |
escape(c.category), | |
c.round, | |
c.difficulty, | |
c.shownumber | |
] | |
end | |
puts "CREATE TABLE clue (text STRING, answer STRING, value STRING, category STRING, round STRING, difficulty STRING, shownumber STRING);" | |
Dir.foreach("/Users/dan/j-archive/html") do |f| | |
next unless (f =~ /\.html$/) | |
next unless (File.size("/Users/dan/j-archive/html/#{f}") > 0) | |
page = Nokogiri::HTML(open("/Users/dan/j-archive/html/#{f}", "r")) | |
next if page.css("div#jeopardy_round td.clue>table").empty? | |
["jeopardy_round", "double_jeopardy_round"].each do |round_name| | |
write_round(page, round_name) | |
end | |
write_fj(page) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment