Created
August 21, 2014 12:24
-
-
Save cmar/34cc01ac13d2bc0077d0 to your computer and use it in GitHub Desktop.
Build database of NFL Players using Ruby Threads
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
#
# Builds a database of NFL players.
#
# Ruby script that parses nfl.com/teams to build a list of teams,
# then creates a thread to parse each team roster.
#
# This creates 32 threads, each appending to the all_players array.
# It works well in Ruby MRI because of the global interpreter lock.
#
# In the end it builds a players.csv with ~2900 players.
#
# On my Mac it runs in ~12s. The CHALLENGE is to refactor the
# threading by changing the number of threads, adding locks, or adding
# a queue to improve the time.
#
require 'benchmark'
require 'csv'
require 'open-uri'

require 'nokogiri'
# Base URL that every team and roster link is built from.
NFL_URL = "http://www.nfl.com".freeze

# One roster entry. Members are positional, so `Player.new(team)` sets only
# the team and leaves the remaining fields nil until they are assigned.
Player = Struct.new(
  :team, :number, :first_name, :last_name, :position,
  :height, :weight, :birthday, :experience, :college
)
# Downloads one team's roster page and turns each roster table row into a
# Player.
#
# @param roster_link [String] URL of the roster page; the team code is the
#   run of word characters that follows an "=" at the very end of the URL
# @return [Array<Player>] one Player per matched table row, in page order
def parse_team_roster(roster_link)
  # URI.open is open-uri's entry point (Kernel#open no longer accepts URLs on
  # Ruby 3.0+); the block form closes the downloaded IO once it is parsed.
  roster_doc = URI.open(roster_link) { |io| Nokogiri::HTML(io) }
  player_rows = roster_doc.css('div#team-stats-wrapper table#result tbody:nth-child(2) tr')
  team = roster_link[/(?<==)\w+\z/]

  player_rows.map do |row|
    cell_text = ->(column) { row.css("td:nth-child(#{column})").text }
    name = row.css('td:nth-child(2) a').text.split(', ') # "Aiken, Kamar"

    player = Player.new(team)
    player.number = cell_text.call(1)
    player.first_name = name.last
    player.last_name = name.first
    player.position = cell_text.call(3)
    # Column 4 is intentionally not read; only the height cell is stripped.
    player.height = cell_text.call(5).strip
    player.weight = cell_text.call(6)
    player.birthday = cell_text.call(7)
    player.experience = cell_text.call(8)
    player.college = cell_text.call(9)
    player
  end
end
# Downloads the NFL teams index page and derives one roster URL per team.
#
# @return [Array<String>] unique absolute links, with every occurrence of
#   "profile" in each link replaced by "roster"
def parse_roster_links
  # URI.open is open-uri's entry point (Kernel#open no longer accepts URLs on
  # Ruby 3.0+); the block form closes the downloaded IO once it is parsed.
  doc = URI.open("#{NFL_URL}/teams") { |io| Nokogiri::HTML(io) }

  # The selector only matches hrefs that already begin with "/", so they are
  # appended directly; inserting another "/" produced "//team..." URLs.
  team_links = doc.css("a[href^='/team']").map { |a| "#{NFL_URL}#{a['href']}" }.uniq
  team_links.map { |link| link.gsub('profile', 'roster') }
end
# Main
#
# Spawns one thread per roster link, collects every parsed Player into
# all_players, reports the count and timing, then writes one CSV row per
# player to players.csv.
all_players = []
# Guards the shared array so that collecting results does not depend on the
# interpreter serializing Array#concat across threads.
all_players_lock = Mutex.new
threads = []
roster_links = parse_roster_links

p "Starting #{roster_links.size} threads"

# A thread starts running as soon as Thread.new returns, so the threads are
# created inside the measured block; timing only the joins would miss any
# work already done before the timer started.
time = Benchmark.measure do
  roster_links.each do |roster_link|
    threads << Thread.new do
      players = parse_team_roster(roster_link)
      all_players_lock.synchronize { all_players.concat(players) }
      p "parsed #{roster_link}"
    end
  end

  threads.each(&:join)
end

p "Found #{all_players.size} players"
p time

CSV.open("players.csv", "w+") do |csv|
  all_players.each do |player|
    csv << player.values
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment