Last active
January 18, 2016 12:33
-
-
Save YuheiNakasaka/aa75bbbb0d6072f97359 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# net keiba scraper | |
# https://github.com/stockedge/netkeiba-scraper のsbt "run extract"でエラーでるのでRubyでscrapingしてDBに突っ込むとこまでスクリプトにした。 | |
# 前提として、net_keiba_htmls配下にsbt "run scrapehtml"で取得したhtmlファイルをここに全部置くこと | |
# あとは下記のディレクトリ構成にして、mysqlでDBをつくって、以下のテーブル作って、このスクリプトを実行すればよいはず | |
# ディレクトリ構成 | |
# ./ | |
# |_ Gemfile | |
# |_ net_keiba_scraper.rb | |
# |_ config | |
# |_ database.yml | |
# |_ net_keiba_htmls | |
# Gemfile | |
# source 'https://rubygems.org' | |
# gem 'activerecord' | |
# gem 'mysql2' | |
# gem 'activerecord-import' | |
# DB | |
# CREATE TABLE `race_definitions` ( | |
# `race_id` bigint(20) DEFAULT NULL, | |
# `race_name` varchar(255) DEFAULT NULL, | |
# `surface` varchar(255) DEFAULT NULL, | |
# `distance` int(11) DEFAULT NULL, | |
# `weather` varchar(255) DEFAULT NULL, | |
# `surface_state` varchar(255) DEFAULT NULL, | |
# `race_start` varchar(255) DEFAULT NULL, | |
# `race_number` int(11) DEFAULT NULL, | |
# `surface_score` varchar(255) DEFAULT NULL, | |
# `date` varchar(255) DEFAULT NULL, | |
# `place_detail` varchar(255) DEFAULT NULL, | |
# `race_class` varchar(255) DEFAULT NULL | |
# ) ENGINE=InnoDB DEFAULT CHARSET=utf8; | |
# | |
# | |
# CREATE TABLE `race_results` ( | |
# `id` int(11) unsigned NOT NULL AUTO_INCREMENT, | |
# `race_definition_id` bigint(20) DEFAULT NULL, | |
# `order_of_finish` int(11) DEFAULT NULL, | |
# `frame_number` int(11) DEFAULT NULL, | |
# `horse_number` int(11) DEFAULT NULL, | |
# `horse_id` varchar(255) DEFAULT NULL, | |
# `sex` varchar(255) DEFAULT NULL, | |
# `age` int(11) DEFAULT NULL, | |
# `basis_weight` int(11) DEFAULT NULL, | |
# `jockey_id` varchar(255) DEFAULT NULL, | |
# `finishing_time` varchar(255) DEFAULT NULL, | |
# `length` varchar(255) DEFAULT NULL, | |
# `speed_figure` int(11) DEFAULT NULL, | |
# `pass` varchar(255) DEFAULT NULL, | |
# `last_phase` varchar(255) DEFAULT NULL, | |
# `odds` int(11) DEFAULT NULL, | |
# `popularity` int(11) DEFAULT NULL, | |
# `horse_weight` varchar(255) DEFAULT NULL, | |
# `trainer_id` varchar(255) DEFAULT NULL, | |
# `owner_id` varchar(255) DEFAULT NULL, | |
# `earning_money` varchar(255) DEFAULT NULL, | |
# PRIMARY KEY (`id`) | |
# ) ENGINE=InnoDB AUTO_INCREMENT=494700 DEFAULT CHARSET=utf8; | |
require 'bundler/setup' | |
require 'nokogiri' | |
require 'csv' | |
require 'kconv' | |
require 'active_record' | |
require 'mysql2' | |
require 'activerecord-import' | |
require 'yaml' | |
config = YAML.load_file("#{File.expand_path('../../', __FILE__)}/config/database.yml") | |
ActiveRecord::Base.establish_connection(config["db"]["development"]) | |
class RaceDefinition < ActiveRecord::Base | |
self.primary_key = :race_id | |
has_many :race_results, foreign_key: :race_definition_id, primary_key: :race_id | |
end | |
class RaceResult < ActiveRecord::Base | |
belongs_to :race_definition, foreign_key: :race_id | |
end | |
class NetKeibaScraper | |
def initialize(cwl=false) | |
@cwl = cwl | |
@output_file_dir = "#{File.expand_path('../../', __FILE__)}/net_keiba_htmls" | |
end | |
def run | |
if @cwl == true | |
log('Start') | |
race_info = [] | |
race_results = [] | |
# stopper = 0 | |
Dir.glob(@output_file_dir + "/*.html").each do |f| | |
# ざっくり | |
race_definition_id = File.basename(f).sub(/\.html/,'') | |
# next if stopper == 0 && race_definition_id != '201008050205' | |
# stopper = 1 | |
html = open(f, "r:binary").read | |
parsed_html = Nokogiri::HTML.parse(html.toutf8, nil, 'utf-8') | |
# 必要なデータをスクレイピング | |
next if race_definition_id.match(/(200808020398|200808020399)/) | |
log("race_definition_id: #{race_definition_id}") | |
race_info << get_race_definition(parsed_html, race_definition_id) | |
race_results << get_race_result(parsed_html, race_definition_id) | |
end | |
[race_info, race_results] | |
end | |
end | |
private | |
def get_race_definition(txt, race_definition_id) | |
results = {} | |
racedata = txt.search(".racedata.fc") | |
racedata_mini = txt.search(".smalltxt") | |
raceplaceinfo = txt.search(".result_table_02 tbody") | |
results[:race_definition_id] = race_definition_id | |
results[:race_name] = trim_all(racedata.search("h1").text) | |
results[:surface] = diary_snap_cut(racedata)[1] | |
results[:distance] = diary_snap_cut(racedata)[2] | |
results[:weather] = diary_snap_cut(racedata)[3] | |
results[:surface_state] = diary_snap_cut(racedata)[4] | |
results[:race_start] = diary_snap_cut(racedata)[5] | |
results[:race_number] = trim_all(racedata.search("dt").text).sub(/R/,"") | |
results[:surface_score] = trim_all(raceplaceinfo.search("tr td")[0].text.sub(/[^\d]*\?[^\d]*/,'')) | |
results[:date] = racemeta(racedata_mini)[0] | |
results[:place_detail] = racemeta(racedata_mini)[1] | |
results[:race_class] = trim_all(racemeta(racedata_mini)[2]) | |
results | |
end | |
def get_race_result(txt, race_definition_id) | |
results = [] | |
txt.search("table.race_table_01 tr").each_with_index do |tr, i| | |
next if i == 0 | |
result = {} | |
result[:race_definition_id] = race_definition_id | |
result[:order_of_finish] = trim_all( tr.search("td")[0].text ) | |
result[:frame_number] = trim_all( tr.search("td")[1].text ) | |
result[:horse_number] = trim_all( tr.search("td")[2].text ) | |
result[:horse_id] = horse_id(tr) | |
result[:sex] = sex(tr) | |
result[:age] = age(tr) | |
result[:basis_weight] = trim_all( tr.search("td")[5].text ) | |
result[:jockey_id] = jockey_id(tr) | |
result[:finishing_time] = trim_all( tr.search("td")[7].text ) | |
result[:length] = trim_all( tr.search("td")[8].text ) | |
result[:speed_figure] = trim_all( tr.search("td")[9].text ) | |
result[:pass] = trim_all( tr.search("td")[10].text ) | |
result[:last_phase] = trim_all( tr.search("td")[11].text ) | |
result[:odds] = trim_all( tr.search("td")[12].text ) | |
result[:popularity] = trim_all( tr.search("td")[13].text ) | |
result[:horse_weight] = trim_all( tr.search("td")[14].text ) | |
result[:trainer_id] = trainer_id(tr) | |
result[:owner_id] = owner_id(tr) | |
result[:earning_money] = trim_all( tr.search("td")[20].text ) | |
results[i] = result | |
end | |
results.compact | |
end | |
def trim_all(text) | |
text.gsub(/[\s|\n| |\u00a0|\t]*/,"") | |
end | |
def diary_snap_cut(txt) | |
trim_all(txt.search("diary_snap_cut").text).match(/([^\d]+)(\d+)m\/天候:(.+)\/(.+)\/発走:(.+)/) | |
end | |
def racemeta(txt) | |
txt.text.split(" ") | |
end | |
def horse_id(txt) | |
txt.search("td")[3].search("a")[0][:href].match(/horse\/(\d+)/)[1] | |
end | |
def sex(txt) | |
_sex = trim_all( txt.search("td")[4].text )[0] | |
case _sex | |
when "牡" | |
return "牡" | |
when "牝" | |
return "牝" | |
when "セ" | |
return "セ" | |
end | |
end | |
def age(txt) | |
trim_all( txt.search("td")[4].text )[1] | |
end | |
def jockey_id(txt) | |
trim_all( txt.search("td")[6].search("a")[0][:href].match(/jockey\/(\d+)/)[1] ) | |
end | |
def trainer_id(txt) | |
trim_all( txt.search("td")[18].search("a")[0][:href].match(/trainer\/(\d+)/)[1] ) | |
end | |
def owner_id(txt) | |
trim_all( txt.search("td")[19].search("a")[0][:href].match(/owner\/(.+)\//)[1] ) | |
end | |
end | |
def log(text) | |
puts "[#{Time.now.strftime('%Y/%m/%d %H:%M')}] #{text}" | |
end | |
############## exec ############### | |
begin | |
scraper = NetKeibaScraper.new(true) | |
scraped_data = scraper.run | |
# bulk import | |
race_definitions = [] | |
scraped_data[0].each_with_index do |race_definition, i| | |
race_definitions << RaceDefinition.new(race_id: race_definition[:race_definition_id], race_name: race_definition[:race_name],surface: race_definition[:surface],distance: race_definition[:distance],weather: race_definition[:weather],surface_state: race_definition[:surface_state],race_start: race_definition[:race_start],race_number: race_definition[:race_number],surface_score: race_definition[:surface_score],date: race_definition[:date],place_detail: race_definition[:place_detail],race_class: race_definition[:race_class]) | |
if i % 200 == 0 | |
RaceDefinition.import race_definitions | |
race_definitions = [] | |
end | |
end | |
race_results = [] | |
scraped_data[1].each_with_index do |race_result_arr, i| | |
race_result_arr.each do |race_result| | |
race_results << RaceResult.new(race_definition_id: race_result[:race_definition_id], order_of_finish: race_result[:order_of_finish],frame_number: race_result[:frame_number],horse_number: race_result[:horse_number],horse_id: race_result[:horse_id],sex: race_result[:sex],age: race_result[:age],basis_weight: race_result[:basis_weight],jockey_id: race_result[:jockey_id],finishing_time: race_result[:finishing_time],length: race_result[:length],speed_figure: race_result[:speed_figure],pass: race_result[:pass],last_phase: race_result[:last_phase],odds: race_result[:odds],popularity: race_result[:popularity],horse_weight: race_result[:horse_weight],trainer_id: race_result[:trainer_id],owner_id: race_result[:owner_id],earning_money: race_result[:earning_money]) | |
end | |
if i % 200 == 0 | |
RaceResult.import race_results | |
race_results = [] | |
end | |
end | |
rescue => e | |
p e | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment