Last active
July 14, 2020 21:39
-
-
Save Haumer/8a9cb24294c954418815d15b61cbb1bf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require_relative 'scraper' | |
# Entry point: announce the crawl, fetch the list of movie page URLs,
# scrape every page in turn (logging each URL first), then dump the
# resulting array of movie hashes to stdout.
puts "Fetching URLs"
movies = get_urls.map do |movie_url|
  puts "Scraping #{movie_url}"
  scrape_movie(movie_url)
end
p movies
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'open-uri' | |
| require 'nokogiri' | |
# Collects the detail-page URLs of the first five titles listed on the
# IMDb Top chart page.
#
# Downloads the chart HTML, takes the first five anchors matching
# `.titleColumn a`, and prefixes each anchor's href with the IMDb host.
#
# @return [Array<String>] absolute movie URLs, in page order
# @raise [OpenURI::HTTPError] if the chart page request fails
def get_urls
  chart_url = 'https://www.imdb.com/chart/top'
  # URI.open instead of Kernel#open: open-uri's Kernel#open override was
  # deprecated in Ruby 2.7 and removed in 3.0. The block form also closes
  # the underlying IO once the body has been read.
  chart = Nokogiri::HTML(URI.open(chart_url, &:read))
  chart.search('.titleColumn a').first(5).map do |link|
    "http://www.imdb.com#{link['href']}"
  end
end
# Scrapes a single IMDb movie page into a hash.
#
# @param url [String] absolute URL of the movie's IMDb page
# @return [Hash] with keys :title (String), :year (Integer),
#   :storyline (String), :director (String) and :cast (Array of up to
#   three values taken from the `title` attribute of cast photos)
# @raise [OpenURI::HTTPError] if the page request fails
def scrape_movie(url)
  # URI.open instead of Kernel#open: open-uri's Kernel#open override was
  # deprecated in Ruby 2.7 and removed in 3.0. The block form also closes
  # the underlying IO once the body has been read.
  page = Nokogiri::HTML(URI.open(url, &:read))

  # The <h1> text is split on "(": the first piece is treated as the title
  # and the last as the year. Each piece has its final character dropped --
  # presumably a separator after the title and the closing ")" after the
  # year; verify against the current page markup.
  heading_parts = page.search('h1').text.strip.split('(')

  {
    title: heading_parts.first[0..-2],
    year: heading_parts.last[0..-2].to_i,
    storyline: page.search('.summary_text').text.strip,
    director: page.search("h4:contains('Director') + a").text,
    # Only the first three cast photos are used.
    cast: page.search('.primary_photo a img').take(3).map { |photo| photo['title'] }
  }
end
| # scrape_movie('https://www.imdb.com/title/tt0468569/') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # rubocop:disable all | |
| require_relative '../scraper' | |
# Checks get_urls against the five URLs the chart is expected to list first.
# NOTE(review): get_urls performs a live request, so this spec depends on the
# current state of the IMDb chart page.
describe '#get_urls' do
  it 'should return the correct array of urls' do
    top_five = %w[
      http://www.imdb.com/title/tt0111161/
      http://www.imdb.com/title/tt0068646/
      http://www.imdb.com/title/tt0071562/
      http://www.imdb.com/title/tt0468569/
      http://www.imdb.com/title/tt0050083/
    ]

    expect(get_urls).to eq(top_five)
  end
end
# Checks scrape_movie against the expected hash for one known movie page.
# NOTE(review): scrape_movie performs a live request, so this spec depends on
# the current markup and content of that page.
describe '#scrape_movie' do
  it 'should return the correct movie hash' do
    dark_knight = {
      title: 'The Dark Knight',
      year: 2008,
      storyline: 'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.',
      director: 'Christopher Nolan',
      cast: ['Christian Bale', 'Heath Ledger', 'Aaron Eckhart']
    }

    scraped = scrape_movie('https://www.imdb.com/title/tt0468569/')

    expect(scraped).to eq(dark_knight)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment