Skip to content

Instantly share code, notes, and snippets.

@p5k6
Last active August 29, 2015 13:57
Show Gist options
  • Select an option

  • Save p5k6/9395539 to your computer and use it in GitHub Desktop.

Select an option

Save p5k6/9395539 to your computer and use it in GitHub Desktop.
NHL trade-deadline scraper — still pretty basic.
require 'wombat'

# Disable Pry's pager so long scrape dumps print straight through.
Pry.config.pager = false

# First pass over the NHL trade-tracker article. Column indexes are taken
# from the page's table markup: td[3] holds the acquiring team's logo image,
# td[2] the traded players, td[5] the other team's logo image.
set1 = Wombat.crawl do
  base_url "http://www.nhl.com"
  path '/ice/news.htm?id=675589'
  to_team 'xpath=//*[@id="cmstable_7607"]/tbody[1]/tr/td[3]//img/@src', :list
  to_players 'xpath=//*[@id="cmstable_7607"]/tbody[1]/tr/td[2]', :list
  from_team 'xpath=//*[@id="cmstable_7607"]/tbody[1]/tr/td[5]//img/@src', :list
end
# Second pass over the same article with the team columns swapped
# (td[3] is now the sending team, td[5] the receiving team, td[6] the
# players) — presumably the table alternates direction per row; TODO confirm
# against the live page.
set2 = Wombat.crawl do
  base_url "http://www.nhl.com"
  path '/ice/news.htm?id=675589'
  from_team 'xpath=//*[@id="cmstable_7607"]/tbody[1]/tr/td[3]//img/@src', :list
  to_players 'xpath=//*[@id="cmstable_7607"]/tbody[1]/tr/td[6]', :list
  to_team 'xpath=//*[@id="cmstable_7607"]/tbody[1]/tr/td[5]//img/@src', :list
end
# Extracts the team slug from a logo image URL, e.g. ".../bruins_logo.png"
# or ".../bruins_dark.png" -> "bruins" (capture group 1).
team_pattern = /\/([a-z]+)_(logo|dark)/

trades1 = []
trades2 = []

# Both scraped sets are shaped the same way, so assemble each into its
# bucket with one shared routine: seed a hash per to_team entry, then
# fill in from_team and the raw player text by row index.
[[set1, trades1], [set2, trades2]].each do |set, bucket|
  set['to_team'].each do |src|
    bucket << { "to_team" => src[team_pattern, 1] }
  end
  set['from_team'].each_with_index do |src, idx|
    bucket[idx]['from_team'] = src[team_pattern, 1]
  end
  set['to_players'].each_with_index do |names, idx|
    bucket[idx]["players"] = names
  end
end

trades = trades1 + trades2

# Trades whose player cell holds a single line (one player moving).
my_trades = trades.select { |t| t['players'].split("\r\n").length == 1 }
# Explode multi-player trades (player cell contains several CRLF-separated
# lines) into one record per player; single-player trades are already
# covered by my_trades, so skip them here.
tmp_ar = []
trades.each do |trade|
  names = trade['players'].split("\r\n")
  next if names.length == 1
  names.each do |name|
    tmp_ar << { 'to_team' => trade['to_team'], 'from_team' => trade['from_team'], 'players' => name.strip }
  end
end
# Merge single-player trades with the exploded multi-player records
# (note: concat mutates my_trades in place) and normalize each to a
# one-player record: take the text before the first comma and strip an
# optional leading position letter ("F ", "G " or "D ") via capture group 2.
all_trades_one_way = my_trades.concat(tmp_ar).map do |trade|
  first_entry = trade['players'].split(",").first
  {
    'from_team' => trade['from_team'],
    'to_team' => trade['to_team'],
    'player' => first_entry[/(^[FGD] |^)(.*)/, 2]
  }
end
## Scrape capgeek.com player pages (sequential numeric ids) for the page
## title (player name) and the 2013-14 salary table row.
salaries_raw = []
# Range#each instead of `for` — `for` leaks its loop variable into the
# surrounding scope, which Ruby style guides flag.
(10..3325).each do |player_id|
  salaries_raw << Wombat.crawl do
    base_url "http://www.capgeek.com"
    path "/player/#{player_id}"
    player 'xpath=//title', :list
    salary 'xpath=//tr[@class="odd" and td = "2013-14"]', :list
  end.merge("id" => player_id)
  sleep 10 # throttle: be polite to the remote server
end

# Restore Pry's pager now that scraping is done.
Pry.config.pager = true
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment