Skip to content

Instantly share code, notes, and snippets.

@YuheiNakasaka
Last active January 18, 2016 12:33
Show Gist options
  • Save YuheiNakasaka/aa75bbbb0d6072f97359 to your computer and use it in GitHub Desktop.
Save YuheiNakasaka/aa75bbbb0d6072f97359 to your computer and use it in GitHub Desktop.
# coding: utf-8
# net keiba scraper
# https://github.com/stockedge/netkeiba-scraper のsbt "run extract"でエラーでるのでRubyでscrapingしてDBに突っ込むとこまでスクリプトにした。
# 前提として、net_keiba_htmls配下にsbt "run scrapehtml"で取得したhtmlファイルをここに全部置くこと
# あとは下記のディレクトリ構成にして、mysqlでDBをつくって、以下のテーブル作って、このスクリプトを実行すればよいはず
# ディレクトリ構成
# ./
# |_ Gemfile
# |_ net_keiba_scraper.rb
# |_ config
# |_ database.yml
# |_ net_keiba_htmls
# Gemfile
# source 'https://rubygems.org'
# gem 'activerecord'
# gem 'mysql2'
# gem 'activerecord-import'
# DB
# CREATE TABLE `race_definitions` (
# `race_id` bigint(20) DEFAULT NULL,
# `race_name` varchar(255) DEFAULT NULL,
# `surface` varchar(255) DEFAULT NULL,
# `distance` int(11) DEFAULT NULL,
# `weather` varchar(255) DEFAULT NULL,
# `surface_state` varchar(255) DEFAULT NULL,
# `race_start` varchar(255) DEFAULT NULL,
# `race_number` int(11) DEFAULT NULL,
# `surface_score` varchar(255) DEFAULT NULL,
# `date` varchar(255) DEFAULT NULL,
# `place_detail` varchar(255) DEFAULT NULL,
# `race_class` varchar(255) DEFAULT NULL
# ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
#
#
# CREATE TABLE `race_results` (
# `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
# `race_definition_id` bigint(20) DEFAULT NULL,
# `order_of_finish` int(11) DEFAULT NULL,
# `frame_number` int(11) DEFAULT NULL,
# `horse_number` int(11) DEFAULT NULL,
# `horse_id` varchar(255) DEFAULT NULL,
# `sex` varchar(255) DEFAULT NULL,
# `age` int(11) DEFAULT NULL,
# `basis_weight` int(11) DEFAULT NULL,
# `jockey_id` varchar(255) DEFAULT NULL,
# `finishing_time` varchar(255) DEFAULT NULL,
# `length` varchar(255) DEFAULT NULL,
# `speed_figure` int(11) DEFAULT NULL,
# `pass` varchar(255) DEFAULT NULL,
# `last_phase` varchar(255) DEFAULT NULL,
# `odds` int(11) DEFAULT NULL,
# `popularity` int(11) DEFAULT NULL,
# `horse_weight` varchar(255) DEFAULT NULL,
# `trainer_id` varchar(255) DEFAULT NULL,
# `owner_id` varchar(255) DEFAULT NULL,
# `earning_money` varchar(255) DEFAULT NULL,
# PRIMARY KEY (`id`)
# ) ENGINE=InnoDB AUTO_INCREMENT=494700 DEFAULT CHARSET=utf8;
require 'bundler/setup'
require 'nokogiri'
require 'csv'
require 'kconv'
require 'active_record'
require 'mysql2'
require 'activerecord-import'
require 'yaml'
config = YAML.load_file("#{File.expand_path('../../', __FILE__)}/config/database.yml")
ActiveRecord::Base.establish_connection(config["db"]["development"])
class RaceDefinition < ActiveRecord::Base
self.primary_key = :race_id
has_many :race_results, foreign_key: :race_definition_id, primary_key: :race_id
end
class RaceResult < ActiveRecord::Base
belongs_to :race_definition, foreign_key: :race_id
end
class NetKeibaScraper
def initialize(cwl=false)
@cwl = cwl
@output_file_dir = "#{File.expand_path('../../', __FILE__)}/net_keiba_htmls"
end
def run
if @cwl == true
log('Start')
race_info = []
race_results = []
# stopper = 0
Dir.glob(@output_file_dir + "/*.html").each do |f|
# ざっくり
race_definition_id = File.basename(f).sub(/\.html/,'')
# next if stopper == 0 && race_definition_id != '201008050205'
# stopper = 1
html = open(f, "r:binary").read
parsed_html = Nokogiri::HTML.parse(html.toutf8, nil, 'utf-8')
# 必要なデータをスクレイピング
next if race_definition_id.match(/(200808020398|200808020399)/)
log("race_definition_id: #{race_definition_id}")
race_info << get_race_definition(parsed_html, race_definition_id)
race_results << get_race_result(parsed_html, race_definition_id)
end
[race_info, race_results]
end
end
private
def get_race_definition(txt, race_definition_id)
results = {}
racedata = txt.search(".racedata.fc")
racedata_mini = txt.search(".smalltxt")
raceplaceinfo = txt.search(".result_table_02 tbody")
results[:race_definition_id] = race_definition_id
results[:race_name] = trim_all(racedata.search("h1").text)
results[:surface] = diary_snap_cut(racedata)[1]
results[:distance] = diary_snap_cut(racedata)[2]
results[:weather] = diary_snap_cut(racedata)[3]
results[:surface_state] = diary_snap_cut(racedata)[4]
results[:race_start] = diary_snap_cut(racedata)[5]
results[:race_number] = trim_all(racedata.search("dt").text).sub(/R/,"")
results[:surface_score] = trim_all(raceplaceinfo.search("tr td")[0].text.sub(/[^\d]*\?[^\d]*/,''))
results[:date] = racemeta(racedata_mini)[0]
results[:place_detail] = racemeta(racedata_mini)[1]
results[:race_class] = trim_all(racemeta(racedata_mini)[2])
results
end
def get_race_result(txt, race_definition_id)
results = []
txt.search("table.race_table_01 tr").each_with_index do |tr, i|
next if i == 0
result = {}
result[:race_definition_id] = race_definition_id
result[:order_of_finish] = trim_all( tr.search("td")[0].text )
result[:frame_number] = trim_all( tr.search("td")[1].text )
result[:horse_number] = trim_all( tr.search("td")[2].text )
result[:horse_id] = horse_id(tr)
result[:sex] = sex(tr)
result[:age] = age(tr)
result[:basis_weight] = trim_all( tr.search("td")[5].text )
result[:jockey_id] = jockey_id(tr)
result[:finishing_time] = trim_all( tr.search("td")[7].text )
result[:length] = trim_all( tr.search("td")[8].text )
result[:speed_figure] = trim_all( tr.search("td")[9].text )
result[:pass] = trim_all( tr.search("td")[10].text )
result[:last_phase] = trim_all( tr.search("td")[11].text )
result[:odds] = trim_all( tr.search("td")[12].text )
result[:popularity] = trim_all( tr.search("td")[13].text )
result[:horse_weight] = trim_all( tr.search("td")[14].text )
result[:trainer_id] = trainer_id(tr)
result[:owner_id] = owner_id(tr)
result[:earning_money] = trim_all( tr.search("td")[20].text )
results[i] = result
end
results.compact
end
def trim_all(text)
text.gsub(/[\s|\n| |\u00a0|\t]*/,"")
end
def diary_snap_cut(txt)
trim_all(txt.search("diary_snap_cut").text).match(/([^\d]+)(\d+)m\/天候:(.+)\/(.+)\/発走:(.+)/)
end
def racemeta(txt)
txt.text.split(" ")
end
def horse_id(txt)
txt.search("td")[3].search("a")[0][:href].match(/horse\/(\d+)/)[1]
end
def sex(txt)
_sex = trim_all( txt.search("td")[4].text )[0]
case _sex
when "牡"
return "牡"
when "牝"
return "牝"
when "セ"
return "セ"
end
end
def age(txt)
trim_all( txt.search("td")[4].text )[1]
end
def jockey_id(txt)
trim_all( txt.search("td")[6].search("a")[0][:href].match(/jockey\/(\d+)/)[1] )
end
def trainer_id(txt)
trim_all( txt.search("td")[18].search("a")[0][:href].match(/trainer\/(\d+)/)[1] )
end
def owner_id(txt)
trim_all( txt.search("td")[19].search("a")[0][:href].match(/owner\/(.+)\//)[1] )
end
end
def log(text)
puts "[#{Time.now.strftime('%Y/%m/%d %H:%M')}] #{text}"
end
############## exec ###############
begin
scraper = NetKeibaScraper.new(true)
scraped_data = scraper.run
# bulk import
race_definitions = []
scraped_data[0].each_with_index do |race_definition, i|
race_definitions << RaceDefinition.new(race_id: race_definition[:race_definition_id], race_name: race_definition[:race_name],surface: race_definition[:surface],distance: race_definition[:distance],weather: race_definition[:weather],surface_state: race_definition[:surface_state],race_start: race_definition[:race_start],race_number: race_definition[:race_number],surface_score: race_definition[:surface_score],date: race_definition[:date],place_detail: race_definition[:place_detail],race_class: race_definition[:race_class])
if i % 200 == 0
RaceDefinition.import race_definitions
race_definitions = []
end
end
race_results = []
scraped_data[1].each_with_index do |race_result_arr, i|
race_result_arr.each do |race_result|
race_results << RaceResult.new(race_definition_id: race_result[:race_definition_id], order_of_finish: race_result[:order_of_finish],frame_number: race_result[:frame_number],horse_number: race_result[:horse_number],horse_id: race_result[:horse_id],sex: race_result[:sex],age: race_result[:age],basis_weight: race_result[:basis_weight],jockey_id: race_result[:jockey_id],finishing_time: race_result[:finishing_time],length: race_result[:length],speed_figure: race_result[:speed_figure],pass: race_result[:pass],last_phase: race_result[:last_phase],odds: race_result[:odds],popularity: race_result[:popularity],horse_weight: race_result[:horse_weight],trainer_id: race_result[:trainer_id],owner_id: race_result[:owner_id],earning_money: race_result[:earning_money])
end
if i % 200 == 0
RaceResult.import race_results
race_results = []
end
end
rescue => e
p e
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment