Created
December 12, 2012 19:02
-
-
Save arjunvenkat/4270589 to your computer and use it in GitHub Desktop.
scraper to save chord information for songs on ultimate guitar
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'csv'
require 'mechanize' # the gem's file is lowercase; 'Mechanize' only loads on case-insensitive filesystems
require 'awesome_print'

# Scraper that walks the ultimate-guitar.com band listings letter by letter,
# follows every band page, and saves the chords of well-rated "Chords" tabs
# into one CSV file per listing page ("<letter>_bands_pg<page>.csv").

# Full set of listing letters; swap it in for a complete crawl.
# letter_array = ['0-9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
#                 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
#                 'u', 'v', 'w', 'x', 'y', 'z']
letter_array = ['b']

# Listing page to start from for every letter. The page counter and the URL
# suffix are both derived from this single value so they cannot drift apart.
# Reset to 1 for a crawl from the beginning of a letter.
START_PAGE = 8

# A tab is kept only when its star rating is strictly greater than MIN_RATING
# and its number of ratings is strictly greater than MIN_NUM_RATINGS.
MIN_RATING = 3
MIN_NUM_RATINGS = 2

# Matches the pagination link at the bottom of listing and band pages.
NEXT_LINK_TEXT = /^Next/

# Builds the listing URL for a letter. The first page carries no numeric
# suffix ("b.htm"); later pages append the page number ("b8.htm").
def letter_page_url(letter, page_number)
  suffix = page_number > 1 ? page_number.to_s : ''
  "http://www.ultimate-guitar.com/bands/#{letter}#{suffix}.htm"
end

# Follows the "Next" link of a Mechanize page.
# Returns the next page, or nil when the page has no such link.
def next_page(page)
  link = page.link_with(:text => NEXT_LINK_TEXT)
  link ? link.click : nil
end

# Extracts [rating, num_ratings] from a song row.
# Rows without a "[ n ]" review marker (or without a star image) count as 0.
def parse_rating(row)
  review_text = row.css('td:nth-child(2)').text
  return [0, 0] unless review_text.include?('[') # no reviews for this tab

  star_image = row.css('td:nth-child(2) img').first
  # the star count is encoded in the image file name, e.g. ".../r5.gif"
  rating = star_image ? star_image['src'].to_s.rpartition('/r')[2].rpartition('.gif')[0].to_i : 0
  # the number of ratings sits between "[ " and " ]"
  num_ratings = review_text.rpartition('[ ')[2].rpartition(' ]')[0].to_i
  [rating, num_ratings]
end

# Scrapes one row of a band's song table.
# Returns [band_name, song_title, chords, rating, num_ratings, link] for a
# sufficiently rated "Chords" entry, or nil when the row should be skipped.
def scrape_song(agent, band_name, row)
  return nil unless row.css('td:nth-child(3)').text == "Chords" # only "chord" pages are of interest

  rating, num_ratings = parse_rating(row)
  return nil unless rating > MIN_RATING && num_ratings > MIN_NUM_RATINGS

  chord_anchor = row.css('td:nth-child(1) > a').first
  chord_page_link = chord_anchor && chord_anchor['href']
  return nil unless chord_page_link # row has no usable link

  puts chord_page_link
  return nil if chord_page_link.include?("lyricsmode")

  song_page = agent.get(chord_page_link)
  song_title = song_page.search('td.fs-10 h1').text.rpartition(" Chords")[0] # removes " Chords" from the title
  # Chords happen to be wrapped in span tags inside the <pre> block; other
  # content may be in span tags as well, so treat the result with care.
  song_chords = song_page.search('pre').css('span').map { |chord| chord.text }.uniq
  puts "#{song_title} saved"
  [band_name, song_title, song_chords, rating, num_ratings, chord_page_link]
end

# Walks every page of a band's tab list and returns the scraped songs.
def scrape_band(agent, band_name, band_page_url)
  band_songs = []
  band_page = agent.get(band_page_url)
  while band_page
    song_rows = band_page.search('table[cellspacing="0"][cellpadding="2"]').css('tr') # table with song names/links
    song_rows.each do |song_row|
      song = scrape_song(agent, band_name, song_row)
      band_songs << song if song
    end
    band_page = next_page(band_page) # nil after the band's last page
  end
  band_songs
end

# Writes the songs collected from one listing page to a CSV file.
def write_songs_csv(path, songs)
  CSV.open(path, "wb") do |csv|
    # six columns, matching the six fields of every song row
    csv << ["Band Name", "Song Title", "Song Chords", "Rating", "Number of Ratings", "Chord Page Link"]
    songs.each { |song| csv << song }
  end
end

letter_array.each do |letter|
  bands_done = 0
  page_number = START_PAGE
  agent = Mechanize.new
  letter_page = agent.get(letter_page_url(letter, page_number))

  while letter_page
    songs = [] # songs found on the current listing page
    band_rows = letter_page.search('td[style="padding:8px"] table[cellpadding="2"]').css('tr') # table with band names/links
    puts "page: #{page_number}"

    band_rows.each_with_index do |band_row, row_index|
      next if row_index.zero? # the first row in the table is blank
      puts bands_done
      # next if bands_done > 10 # uncomment to limit the amount of data while testing

      band_links = band_row.css('td:nth-child(2) > a')
      band_name = band_links.text.rpartition(" Tabs")[0] # removes " Tabs" from the band name
      band_anchor = band_links.first
      band_page_url = band_anchor && band_anchor['href']
      next unless band_page_url # row has no band link
      next if band_page_url.include?('tp.ultimate') || band_page_url.include?('lyricsmode')

      puts "Checking #{band_name} at #{band_page_url}"
      songs.concat(scrape_band(agent, band_name, band_page_url))
      puts "#{band_name} completed \n "
      bands_done += 1
    end

    write_songs_csv("#{letter}_bands_pg#{page_number}.csv", songs)

    letter_page = next_page(letter_page) # nil after the letter's last listing page
    page_number += 1
  end

  puts "CSV saved for #{letter} bands"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment