Skip to content

Instantly share code, notes, and snippets.

@caioertai
Last active July 11, 2018 09:56
Show Gist options
  • Save caioertai/aeb9ae10b12afd054f7d9c4ca7344bd3 to your computer and use it in GitHub Desktop.
Save caioertai/aeb9ae10b12afd054f7d9c4ca7344bd3 to your computer and use it in GitHub Desktop.
Movie scraper live code.
require_relative "scraper.rb"
# Command-line entry point: collect the movie URLs, scrape every page,
# then print one short report per movie.
puts "Fetching urls..."

movies = movie_getter.map do |movie_url|
  puts "Scraping #{movie_url}"
  scrape_movie(movie_url)
end

divider = "------------------------------------------"

# Ranks are 1-based and follow the order in which the URLs were returned.
movies.each.with_index(1) do |details, rank|
  puts divider,
       "Top #{rank}",
       " Title: #{details[:title]}",
       " Year: #{details[:year]}",
       " Director: #{details[:director]}",
       " Cast: #{details[:cast]}",
       " Storyline: #{details[:storyline]}"
end

puts divider
require "open-uri"
require "nokogiri"
# Downloads the IMDb chart page (/chart/top) and returns the URLs of the
# movies listed first on it.
#
# @param limit [Integer] how many chart entries to return (defaults to 5)
# @return [Array<String>] absolute movie-page URLs with any query string removed
def movie_getter(limit = 5)
  chart_url = "http://www.imdb.com/chart/top"

  # The block form of open closes the downloaded stream as soon as the
  # document has been parsed, instead of leaving it open.
  html = URI.parse(chart_url).open("Accept-Language" => "en") do |html_file|
    Nokogiri::HTML(html_file)
  end

  # Movie links are the anchors matched by ".titleColumn a"; the href is
  # site-relative, so the host is prepended and everything from "?" on is cut.
  html.search(".titleColumn a").first(limit).map do |movie|
    "http://www.imdb.com#{movie.attribute("href").value.gsub(/\?.*/, "")}"
  end
end
# Scrapes one IMDb movie page and returns its headline details.
#
# Page elements that cannot be found yield empty values instead of raising:
# the text fields become "", the year becomes 0, the director becomes nil
# and the cast becomes [].
#
# @param url [String] address of the movie page
# @return [Hash] with the keys :cast (Array<String>), :director (String or nil),
#   :storyline (String), :title (String) and :year (Integer)
def scrape_movie(url)
  # The block form of open closes the downloaded stream once parsing is done.
  doc = URI.parse(url).open("Accept-Language" => "en") do |file|
    Nokogiri::HTML(file)
  end

  # .search returns every matching node; .at returns only the first match,
  # or nil when nothing matches.

  # Heading text with the character before "(" and the rest of that line removed.
  title = doc.search(".title_wrapper h1").text.gsub(/.\(.*/, "")
  # Only word characters are kept from the #titleYear text.
  year = doc.search("#titleYear").text.gsub(/\W/, "")
  storyline = doc.search(".summary_text").text.strip

  director_node = doc.at(".credit_summary_item .itemprop")
  director = director_node ? director_node.text : nil

  cast_node = doc.at("#title-overview-widget > div.plot_summary_wrapper > div.plot_summary > div:nth-child(4)")
  cast_text = cast_node ? cast_node.text : ""
  # Captures the three comma-separated names that follow the "Stars:" label.
  match_data = cast_text.match(/\A\s+Stars:\s+(\w.+),\s+(\w.+),\s+(\w.+?\s\w.+)\s/)
  stars = match_data ? [match_data[1], match_data[2], match_data[3].strip] : []

  {
    cast: stars,
    director: director,
    storyline: storyline,
    title: title,
    year: year.to_i
  }
end
require_relative '../scraper.rb'
# Compares the URLs returned by movie_getter against a fixed list of five.
describe '#movie_getter' do
  it 'should scrape imdb for top 5 movies' do
    top_five_urls = %w[
      http://www.imdb.com/title/tt0111161/
      http://www.imdb.com/title/tt0068646/
      http://www.imdb.com/title/tt0071562/
      http://www.imdb.com/title/tt0468569/
      http://www.imdb.com/title/tt0050083/
    ]

    expect(movie_getter).to eq(top_five_urls)
  end
end
# Compares the hash scraped from one fixed movie URL against known values.
describe '#scrape_movie' do
  it 'should scrape movie url for expected data' do
    movie_url = 'http://www.imdb.com/title/tt0468569/'
    dark_knight = {
      cast: ['Christian Bale', 'Heath Ledger', 'Aaron Eckhart'],
      director: 'Christopher Nolan',
      storyline: 'When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham. The Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice.',
      title: 'The Dark Knight',
      year: 2008
    }

    expect(scrape_movie(movie_url)).to eq(dark_knight)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment