Last active
July 11, 2018 09:56
-
-
Save caioertai/aeb9ae10b12afd054f7d9c4ca7344bd3 to your computer and use it in GitHub Desktop.
Movie scraper live code.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require_relative "scraper.rb" | |
# Command-line entry point: asks movie_getter for the movie URLs, scrapes each
# page with scrape_movie, then prints a numbered summary of every movie.
puts "Fetching urls..."
urls = movie_getter

# Scrape sequentially, announcing each URL before it is fetched.
movies = urls.map do |url|
  puts "Scraping #{url}"
  scrape_movie(url)
end

separator = "------------------------------------------"

# Ranks are 1-based, so start the index at 1 instead of adding 1 on every pass.
movies.each.with_index(1) do |movie, rank|
  puts separator
  puts "Top #{rank}"
  puts " Title: #{movie[:title]}"
  puts " Year: #{movie[:year]}"
  puts " Director: #{movie[:director]}"
  # :cast is an array of names; join it so the line reads as plain text
  # instead of the Array#inspect form (brackets and quotes).
  puts " Cast: #{movie[:cast].join(', ')}"
  puts " Storyline: #{movie[:storyline]}"
end
puts separator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "open-uri" | |
require "nokogiri" | |
# Downloads the IMDb top chart page and returns the absolute URLs of the
# first entries found under the ".titleColumn a" selector.
#
# @param limit [Integer] how many entries to return, counted from the top of
#   the chart (defaults to 5, the original fixed behaviour)
# @return [Array<String>] movie page URLs with any query string removed
def movie_getter(limit: 5)
  chart_url = "http://www.imdb.com/chart/top"

  # The block form of open closes the downloaded stream once the block
  # returns, so the handle is not leaked after Nokogiri has parsed it.
  html = URI.parse(chart_url).open("Accept-Language" => "en") do |html_file|
    Nokogiri::HTML(html_file)
  end

  html.search(".titleColumn a").first(limit).map do |movie|
    # Strip everything from the first "?" onward so only the path remains.
    path = movie.attribute("href").value.gsub(/\?.*/, "")
    "http://www.imdb.com#{path}"
  end
end
# Downloads one movie page and extracts its headline details.
#
# Elements that cannot be found do not raise: the director becomes nil and
# the cast becomes an empty array, so one unexpected page layout does not
# abort the whole run.
#
# @param url [String] absolute URL of the movie page
# @return [Hash] with keys :cast (Array<String>), :director (String, nil),
#   :storyline (String), :title (String) and :year (Integer)
def scrape_movie(url)
  # The block form of open closes the downloaded stream once the block
  # returns, so the handle is not leaked after Nokogiri has parsed it.
  doc = URI.parse(url).open("Accept-Language" => "en") do |file|
    Nokogiri::HTML(file)
  end

  # .search returns every match; .at returns only the first match (or nil).

  # Title lives in ".title_wrapper h1"; drop everything from the character
  # just before the "(" onward (presumably the trailing year suffix).
  title = doc.search(".title_wrapper h1").text.gsub(/.\(.*/, "")
  # Year lives in "#titleYear"; remove every non-word character around it.
  year = doc.search("#titleYear").text.gsub(/\W/, "")
  storyline = doc.search(".summary_text").text.strip
  director = doc.at(".credit_summary_item .itemprop")&.text

  # The cast block is plain text of the form "Stars: A, B, C ..."; the three
  # names are pulled out with one capture group each.
  cast_node = doc.at("#title-overview-widget > div.plot_summary_wrapper > div.plot_summary > div:nth-child(4)")
  cast = cast_node&.text.to_s
  match_data = cast.match(/\A\s+Stars:\s+(\w.+),\s+(\w.+),\s+(\w.+?\s\w.+)\s/)
  stars = match_data ? match_data.captures.map(&:strip) : []

  {
    cast: stars,
    director: director,
    storyline: storyline,
    title: title,
    year: year.to_i
  }
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require_relative '../scraper.rb' | |
# Checks that movie_getter returns exactly the five expected IMDb title URLs,
# in this order.
describe '#movie_getter' do
  let(:movies_expected) do
    %w[
      http://www.imdb.com/title/tt0111161/
      http://www.imdb.com/title/tt0068646/
      http://www.imdb.com/title/tt0071562/
      http://www.imdb.com/title/tt0468569/
      http://www.imdb.com/title/tt0050083/
    ]
  end

  it 'should scrape imdb for top 5 movies' do
    expect(movie_getter).to eq(movies_expected)
  end
end
# Checks that scrape_movie turns one known movie page into the expected hash
# of title, year, director, cast and storyline.
describe '#scrape_movie' do
  let(:movie_url) { 'http://www.imdb.com/title/tt0468569/' }

  let(:expected_data) do
    {
      title: 'The Dark Knight',
      year: 2008,
      director: 'Christopher Nolan',
      cast: ['Christian Bale', 'Heath Ledger', 'Aaron Eckhart'],
      storyline: 'When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham. The Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice.'
    }
  end

  it 'should scrape movie url for expected data' do
    expect(scrape_movie(movie_url)).to eq(expected_data)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment